diff --git a/.bazelrc b/.bazelrc index d8990ac5c12cc5..d7ae76f096431a 100644 --- a/.bazelrc +++ b/.bazelrc @@ -253,7 +253,7 @@ build:cuda_clang_official --action_env=CUDA_TOOLKIT_PATH="/usr/local/cuda-12.3" build:cuda_clang_official --action_env=GCC_HOST_COMPILER_PATH="/dt9/usr/bin/gcc" build:cuda_clang_official --action_env=CLANG_CUDA_COMPILER_PATH="/usr/lib/llvm-17/bin/clang" build:cuda_clang_official --action_env=LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64" -build:cuda_clang_official --crosstool_top="@sigbuild-r2.16-clang_config_cuda//crosstool:toolchain" +build:cuda_clang_official --crosstool_top="@sigbuild-r2.17-clang_config_cuda//crosstool:toolchain" # Build with nvcc for CUDA and clang for host build:nvcc_clang --config=cuda @@ -293,6 +293,11 @@ build:rocm --define=using_rocm_hipcc=true build:rocm --define=tensorflow_mkldnn_contraction_kernel=0 build:rocm --repo_env TF_NEED_ROCM=1 +build:sycl --crosstool_top=@local_config_sycl//crosstool:toolchain +build:sycl --define=using_sycl=true +build:sycl --define=tensorflow_mkldnn_contraction_kernel=0 +build:sycl --repo_env TF_NEED_SYCL=1 + # Options to disable default on features build:noaws --define=no_aws_support=true build:nogcp --define=no_gcp_support=true @@ -497,12 +502,12 @@ build:rbe_linux --host_linkopt=-lm build:rbe_linux_cpu --config=rbe_linux # Linux cpu and cuda builds share the same toolchain now. 
-build:rbe_linux_cpu --host_crosstool_top="@sigbuild-r2.16-clang_config_cuda//crosstool:toolchain" -build:rbe_linux_cpu --crosstool_top="@sigbuild-r2.16-clang_config_cuda//crosstool:toolchain" -build:rbe_linux_cpu --extra_toolchains="@sigbuild-r2.16-clang_config_cuda//crosstool:toolchain-linux-x86_64" -build:rbe_linux_cpu --extra_execution_platforms="@sigbuild-r2.16-clang_config_platform//:platform" -build:rbe_linux_cpu --host_platform="@sigbuild-r2.16-clang_config_platform//:platform" -build:rbe_linux_cpu --platforms="@sigbuild-r2.16-clang_config_platform//:platform" +build:rbe_linux_cpu --host_crosstool_top="@sigbuild-r2.17-clang_config_cuda//crosstool:toolchain" +build:rbe_linux_cpu --crosstool_top="@sigbuild-r2.17-clang_config_cuda//crosstool:toolchain" +build:rbe_linux_cpu --extra_toolchains="@sigbuild-r2.17-clang_config_cuda//crosstool:toolchain-linux-x86_64" +build:rbe_linux_cpu --extra_execution_platforms="@sigbuild-r2.17-clang_config_platform//:platform" +build:rbe_linux_cpu --host_platform="@sigbuild-r2.17-clang_config_platform//:platform" +build:rbe_linux_cpu --platforms="@sigbuild-r2.17-clang_config_platform//:platform" # This is needed for all Clang17 builds but must not be present in GCC builds. build:rbe_linux_cpu --copt=-Wno-error=unused-command-line-argument # This was added in clang-16 by https://reviews.llvm.org/D133574. @@ -511,7 +516,7 @@ build:rbe_linux_cpu --copt=-Wno-error=unused-command-line-argument # See https://github.com/protocolbuffers/upb/blob/9effcbcb27f0a665f9f345030188c0b291e32482/upb/upb.c#L183. build:rbe_linux_cpu --copt=-Wno-gnu-offsetof-extensions # Python config is the same across all containers because the binary is the same -build:rbe_linux_cpu --repo_env=TF_PYTHON_CONFIG_REPO="@sigbuild-r2.16-clang_config_python" +build:rbe_linux_cpu --repo_env=TF_PYTHON_CONFIG_REPO="@sigbuild-r2.17-clang_config_python" build:rbe_linux_cpu --python_path="/usr/bin/python3" # These you may need to change for your own GCP project. 
common:rbe_linux_cpu --remote_instance_name=projects/tensorflow-testing/instances/default_instance @@ -532,9 +537,9 @@ build:rbe_linux_cuda --config=cuda_clang_official build:rbe_linux_cuda --config=rbe_linux_cpu # For Remote build execution -- GPU configuration build:rbe_linux_cuda --repo_env=REMOTE_GPU_TESTING=1 -build:rbe_linux_cuda --repo_env=TF_CUDA_CONFIG_REPO="@sigbuild-r2.16-clang_config_cuda" -build:rbe_linux_cuda --repo_env=TF_TENSORRT_CONFIG_REPO="@sigbuild-r2.16-clang_config_tensorrt" -build:rbe_linux_cuda --repo_env=TF_NCCL_CONFIG_REPO="@sigbuild-r2.16-clang_config_nccl" +build:rbe_linux_cuda --repo_env=TF_CUDA_CONFIG_REPO="@sigbuild-r2.17-clang_config_cuda" +build:rbe_linux_cuda --repo_env=TF_TENSORRT_CONFIG_REPO="@sigbuild-r2.17-clang_config_tensorrt" +build:rbe_linux_cuda --repo_env=TF_NCCL_CONFIG_REPO="@sigbuild-r2.17-clang_config_nccl" test:rbe_linux_cuda --test_env=LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64" build:rbe_linux_cuda_nvcc --config=rbe_linux_cuda @@ -639,7 +644,7 @@ test:release_linux_base --test_summary=short # Use the Clang toolchain to compile build:release_cpu_linux --config=release_linux_base -build:release_cpu_linux --crosstool_top="@sigbuild-r2.16-clang_config_cuda//crosstool:toolchain" +build:release_cpu_linux --crosstool_top="@sigbuild-r2.17-clang_config_cuda//crosstool:toolchain" build:release_gpu_linux --config=release_cpu_linux # Set up compilation CUDA version and paths and use the CUDA Clang toolchain. 
diff --git a/.github/workflows/update-rbe.yml b/.github/workflows/update-rbe.yml index d670cd6040401d..94929afefbea9f 100644 --- a/.github/workflows/update-rbe.yml +++ b/.github/workflows/update-rbe.yml @@ -117,6 +117,18 @@ jobs: map sigbuild-r2.16-clang-python3.10 2.16-python3.10 map sigbuild-r2.16-clang-python3.11 2.16-python3.11 map sigbuild-r2.16-clang-python3.12 2.16-python3.12 + # TF 2.17 + map sigbuild-r2.17 2.17-python3.11 + map sigbuild-r2.17-python3.9 2.17-python3.9 + map sigbuild-r2.17-python3.10 2.17-python3.10 + map sigbuild-r2.17-python3.11 2.17-python3.11 + map sigbuild-r2.17-python3.12 2.17-python3.12 + # TF 2.17 + Clang (containers are the same, but env vars in configs.bzl are different) + map sigbuild-r2.17-clang 2.17-python3.11 + map sigbuild-r2.17-clang-python3.9 2.17-python3.9 + map sigbuild-r2.17-clang-python3.10 2.17-python3.10 + map sigbuild-r2.17-clang-python3.11 2.17-python3.11 + map sigbuild-r2.17-clang-python3.12 2.17-python3.12 - name: Create Pull Request with changes uses: peter-evans/create-pull-request@2b011faafdcbc9ceb11414d64d0573f37c774b04 # v4.2.3 with: diff --git a/RELEASE.md b/RELEASE.md index 8c9ba51d7993ae..3c6198b60d1918 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -59,6 +59,15 @@ built with support for a given CPU target. This can be useful for skipping target-specific tests if a target is not supported. +* `tf.data` + * Support `data.experimental.distributed_save`. `distributed_save` uses + tf.data service + (https://www.tensorflow.org/api_docs/python/tf/data/experimental/service) + to write distributed dataset snapshots. The call is non-blocking and + returns without waiting for the snapshot to finish. Setting `wait=True` to + `tf.data.Dataset.load` allows the snapshots to be read while they are + being written. + ### Bug Fixes and Other Changes * @@ -79,6 +88,13 @@ `experimental_default_delegate_latest_features` to enable all default delegate features. +* `tf.data` + * Add `wait` to `tf.data.Dataset.load`. 
If `True`, for snapshots written + with `distributed_save`, it reads the snapshot while it is being written. + For snapshots written with regular `save`, it waits for the snapshot until + it's finished. The default is `False` for backward compatibility. Users of + `distributed_save` are recommended to set it to `True`. + ## Thanks to our Contributors This release contains contributions from many people at Google, as well as: diff --git a/WORKSPACE b/WORKSPACE index 675a9481283514..cb024a13a19a47 100644 --- a/WORKSPACE +++ b/WORKSPACE @@ -2,6 +2,8 @@ workspace(name = "org_tensorflow") +# buildifier: disable=load-on-top + # We must initialize hermetic python first. load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive") @@ -14,6 +16,12 @@ http_archive( ], ) +http_archive( + name = "rules_java", + sha256 = "c73336802d0b4882e40770666ad055212df4ea62cfa6edf9cb0f9d29828a0934", + url = "https://github.com/bazelbuild/rules_java/releases/download/5.3.5/rules_java-5.3.5.tar.gz", +) + http_archive( name = "rules_python", sha256 = "9d04041ac92a0985e344235f5d946f71ac543f1b1565f2cdbc9a2aaee8adf55b", @@ -21,6 +29,7 @@ http_archive( url = "https://github.com/bazelbuild/rules_python/releases/download/0.26.0/rules_python-0.26.0.tar.gz", ) +# buildifier: disable=same-origin-load load("@rules_python//python:repositories.bzl", "py_repositories") py_repositories() diff --git a/ci/official/requirements_updater/WORKSPACE b/ci/official/requirements_updater/WORKSPACE index f9a116a6a3153e..e29f586f933c6a 100644 --- a/ci/official/requirements_updater/WORKSPACE +++ b/ci/official/requirements_updater/WORKSPACE @@ -2,6 +2,8 @@ workspace(name = "requirements_updater") +# buildifier: disable=load-on-top + load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive") http_archive( @@ -20,6 +22,7 @@ http_archive( url = "https://github.com/bazelbuild/rules_python/releases/download/0.26.0/rules_python-0.26.0.tar.gz", ) +# buildifier: disable=same-origin-load 
load("@rules_python//python:repositories.bzl", "py_repositories") py_repositories() diff --git a/ci/official/wheel_test/WORKSPACE b/ci/official/wheel_test/WORKSPACE index d52a3ed895173b..db46144dadbbb1 100644 --- a/ci/official/wheel_test/WORKSPACE +++ b/ci/official/wheel_test/WORKSPACE @@ -2,6 +2,8 @@ workspace(name = "wheel_test") +# buildifier: disable=load-on-top + load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive") http_archive( @@ -20,6 +22,7 @@ http_archive( url = "https://github.com/bazelbuild/rules_python/releases/download/0.26.0/rules_python-0.26.0.tar.gz", ) +# buildifier: disable=same-origin-load load("@rules_python//python:repositories.bzl", "py_repositories") py_repositories() diff --git a/configure.py b/configure.py index 66427431b42c16..0081eeabf66bcc 100644 --- a/configure.py +++ b/configure.py @@ -892,8 +892,8 @@ def set_clang_compiler_path_win(environ_cp): ) write_action_env_to_bazelrc('CLANG_COMPILER_PATH', clang_compiler_path) - write_to_bazelrc('build --repo_env=CC=%s' % clang_compiler_path) - write_to_bazelrc('build --repo_env=BAZEL_COMPILER=%s' % clang_compiler_path) + write_to_bazelrc(f'build --repo_env=CC="{clang_compiler_path}"') + write_to_bazelrc(f'build --repo_env=BAZEL_COMPILER="{clang_compiler_path}"') return clang_compiler_path diff --git a/requirements_lock_3_10.txt b/requirements_lock_3_10.txt index 2335d295d0faf6..05dc3940487eef 100644 --- a/requirements_lock_3_10.txt +++ b/requirements_lock_3_10.txt @@ -249,9 +249,9 @@ h5py==3.10.0 \ # via # -r requirements.in # keras-nightly -idna==3.6 \ - --hash=sha256:9ecdbbd083b06798ae1e86adcbfe8ab1479cf864e4ee30fe4e46a003d12491ca \ - --hash=sha256:c05567e9c24a6b9faaa835c4821bad0590fbb9d5779e7caa6e1cc4978e7eb24f +idna==3.7 \ + --hash=sha256:028ff3aadf0609c1fd278d8ea3089299412a7a8b9bd005dd08b9f8285bcb5cfc \ + --hash=sha256:82fee1fc78add43492d3a1898bfa6d8a904cc97d8427f683ed8e798d07761aa0 # via requests jax==0.4.7 \ 
--hash=sha256:5e7002d74db25f97c99b979d4ba1233b1ef26e1597e5fc468ad11d1c8a9dc4f8 diff --git a/requirements_lock_3_11.txt b/requirements_lock_3_11.txt index 2335d295d0faf6..05dc3940487eef 100644 --- a/requirements_lock_3_11.txt +++ b/requirements_lock_3_11.txt @@ -249,9 +249,9 @@ h5py==3.10.0 \ # via # -r requirements.in # keras-nightly -idna==3.6 \ - --hash=sha256:9ecdbbd083b06798ae1e86adcbfe8ab1479cf864e4ee30fe4e46a003d12491ca \ - --hash=sha256:c05567e9c24a6b9faaa835c4821bad0590fbb9d5779e7caa6e1cc4978e7eb24f +idna==3.7 \ + --hash=sha256:028ff3aadf0609c1fd278d8ea3089299412a7a8b9bd005dd08b9f8285bcb5cfc \ + --hash=sha256:82fee1fc78add43492d3a1898bfa6d8a904cc97d8427f683ed8e798d07761aa0 # via requests jax==0.4.7 \ --hash=sha256:5e7002d74db25f97c99b979d4ba1233b1ef26e1597e5fc468ad11d1c8a9dc4f8 diff --git a/requirements_lock_3_12.txt b/requirements_lock_3_12.txt index 9bc6eff7313ec3..120ec6ebcd7c72 100644 --- a/requirements_lock_3_12.txt +++ b/requirements_lock_3_12.txt @@ -249,9 +249,9 @@ h5py==3.10.0 \ # via # -r requirements.in # keras-nightly -idna==3.6 \ - --hash=sha256:9ecdbbd083b06798ae1e86adcbfe8ab1479cf864e4ee30fe4e46a003d12491ca \ - --hash=sha256:c05567e9c24a6b9faaa835c4821bad0590fbb9d5779e7caa6e1cc4978e7eb24f +idna==3.7 \ + --hash=sha256:028ff3aadf0609c1fd278d8ea3089299412a7a8b9bd005dd08b9f8285bcb5cfc \ + --hash=sha256:82fee1fc78add43492d3a1898bfa6d8a904cc97d8427f683ed8e798d07761aa0 # via requests jax==0.4.7 \ --hash=sha256:5e7002d74db25f97c99b979d4ba1233b1ef26e1597e5fc468ad11d1c8a9dc4f8 diff --git a/requirements_lock_3_9.txt b/requirements_lock_3_9.txt index 9d9e85aceda9c7..36a55514cd788b 100644 --- a/requirements_lock_3_9.txt +++ b/requirements_lock_3_9.txt @@ -249,9 +249,9 @@ h5py==3.10.0 \ # via # -r requirements.in # keras-nightly -idna==3.6 \ - --hash=sha256:9ecdbbd083b06798ae1e86adcbfe8ab1479cf864e4ee30fe4e46a003d12491ca \ - --hash=sha256:c05567e9c24a6b9faaa835c4821bad0590fbb9d5779e7caa6e1cc4978e7eb24f +idna==3.7 \ + 
--hash=sha256:028ff3aadf0609c1fd278d8ea3089299412a7a8b9bd005dd08b9f8285bcb5cfc \ + --hash=sha256:82fee1fc78add43492d3a1898bfa6d8a904cc97d8427f683ed8e798d07761aa0 # via requests importlib-metadata==7.0.1 \ --hash=sha256:4805911c3a4ec7c3966410053e9ec6a1fecd629117df5adee56dfc9432a1081e \ diff --git a/tensorflow/BUILD b/tensorflow/BUILD index 8c1e9d535d5cb1..71487e2aec0bee 100644 --- a/tensorflow/BUILD +++ b/tensorflow/BUILD @@ -1068,6 +1068,7 @@ package_group( "//third_party/py/keras/...", "//third_party/py/tf_keras/...", "//third_party/yggdrasil_decision_forests/...", + "//waymo/accelerator/...", "//waymo/ml/cn/...", "//waymo/ml/models/...", ], @@ -1116,9 +1117,10 @@ bzl_library( "@local_config_cuda//cuda:build_defs_bzl", "@local_config_rocm//rocm:build_defs_bzl", "@local_config_tensorrt//:build_defs_bzl", - "@local_tsl//tsl:tsl_bzl", "@local_tsl//tsl/platform/default:cuda_build_defs_bzl", + "@local_xla//xla/tsl:tsl_bzl", "@local_xla//xla/tsl/mkl:build_defs_bzl", + "@rules_java//java:rules", ], ) diff --git a/tensorflow/c/c_api.cc b/tensorflow/c/c_api.cc index 5490149bc905b1..aa4b5d6987871b 100644 --- a/tensorflow/c/c_api.cc +++ b/tensorflow/c/c_api.cc @@ -176,7 +176,7 @@ void TF_CloseDeprecatedSession(TF_DeprecatedSession* s, TF_Status* status) { } void TF_DeleteDeprecatedSession(TF_DeprecatedSession* s, TF_Status* status) { - status->status = ::tensorflow::OkStatus(); + status->status = absl::OkStatus(); if (s == nullptr) return; delete s->session; delete s; @@ -352,7 +352,7 @@ bool ExtendSessionGraphHelper(TF_Session* session, TF_Status* status) { static void TF_Run_Setup(int noutputs, TF_Tensor** c_outputs, TF_Status* status) { - status->status = ::tensorflow::OkStatus(); + status->status = absl::OkStatus(); for (int i = 0; i < noutputs; ++i) { c_outputs[i] = nullptr; } @@ -388,9 +388,9 @@ static Status TF_TensorToTensorV1(const TF_Tensor* src, Tensor* dst) { return InvalidArgument( "Malformed TF_RESOURCE tensor: unable to parse resource handle"); } - return 
::tensorflow::OkStatus(); + return absl::OkStatus(); } - return ::tensorflow::OkStatus(); + return absl::OkStatus(); } static bool TF_Run_Inputs(TF_Tensor* const* c_inputs, @@ -959,7 +959,7 @@ void TF_SetAttrTensorShapeProto(TF_OperationDescription* desc, TensorShapeProto shape; if (shape.ParseFromArray(proto, static_cast(proto_len))) { desc->node_builder.Attr(attr_name, shape); - status->status = ::tensorflow::OkStatus(); + status->status = absl::OkStatus(); } else { status->status = InvalidArgument("Unparseable TensorShapeProto"); } @@ -986,7 +986,7 @@ void TF_SetAttrTensorShapeProtoList(TF_OperationDescription* desc, } } desc->node_builder.Attr(attr_name, shapes); - status->status = ::tensorflow::OkStatus(); + status->status = absl::OkStatus(); } void TF_SetAttrTensor(TF_OperationDescription* desc, const char* attr_name, @@ -999,7 +999,7 @@ void TF_SetAttrTensor(TF_OperationDescription* desc, const char* attr_name, void TF_SetAttrTensorList(TF_OperationDescription* desc, const char* attr_name, TF_Tensor* const* values, int num_values, TF_Status* status) { - status->status = ::tensorflow::OkStatus(); + status->status = absl::OkStatus(); std::vector t; t.reserve(num_values); @@ -1037,7 +1037,7 @@ void TF_SetAttrValueProto(TF_OperationDescription* desc, const char* attr_name, desc->node_builder.Attr(attr_name, std::move(attr_value)); } - status->status = ::tensorflow::OkStatus(); + status->status = absl::OkStatus(); } TF_Operation* TF_FinishOperationLocked(TF_OperationDescription* desc, @@ -1552,7 +1552,7 @@ void TF_OperationGetAttrName(TF_Operation* oper, int i, char* output, for (it = attrs.begin(); it != attrs.end(); it++) { if (count == i) { strncpy(output, it->first.c_str(), it->first.length()); - status->status = ::tensorflow::OkStatus(); + status->status = absl::OkStatus(); return; } count++; @@ -1931,7 +1931,7 @@ Status CopyGraph(Graph* src_graph, Graph* dst_graph, for (const auto& pair : results.return_tensors) { return_nodes->emplace_back(pair.first, 
pair.second); } - return ::tensorflow::OkStatus(); + return absl::OkStatus(); } bool ValidateConstWhileParams(const TF_WhileParams& params, TF_Status* s) { @@ -2063,7 +2063,7 @@ void TF_FinishWhileHelper(const TF_WhileParams* params, TF_Status* status, scope.impl()->control_deps(), &params->cond_output, /* nreturn_nodes */ 1, &cond_output)); *output = cond_output[0]; - return ::tensorflow::OkStatus(); + return absl::OkStatus(); }; // 'body_fn' copies the body graph into the parent graph. @@ -2078,7 +2078,7 @@ void TF_FinishWhileHelper(const TF_WhileParams* params, TF_Status* status, &parent->refiner, params->body_inputs, inputs, scope.impl()->name(), scope.impl()->control_deps(), params->body_outputs, num_loop_vars, outputs)); - return ::tensorflow::OkStatus(); + return absl::OkStatus(); }; // Create the while loop using an internal scope. @@ -2312,7 +2312,7 @@ void TF_CloseSession(TF_Session* s, TF_Status* status) { } void TF_DeleteSession(TF_Session* s, TF_Status* status) { - status->status = ::tensorflow::OkStatus(); + status->status = absl::OkStatus(); if (s == nullptr) return; TF_Graph* const graph = s->graph; if (graph != nullptr) { @@ -2472,7 +2472,7 @@ TF_ApiDefMap* TF_NewApiDefMap(TF_Buffer* op_list_buffer, TF_Status* status) { status->status = InvalidArgument("Unparseable OpList"); return nullptr; } - status->status = ::tensorflow::OkStatus(); + status->status = absl::OkStatus(); return new TF_ApiDefMap(op_list); } diff --git a/tensorflow/c/c_api_experimental.cc b/tensorflow/c/c_api_experimental.cc index 45697e20d1ea05..bedba2c51c6d39 100644 --- a/tensorflow/c/c_api_experimental.cc +++ b/tensorflow/c/c_api_experimental.cc @@ -501,7 +501,7 @@ TFE_TensorHandle* TFE_NewTensorHandleFromScalar(TF_DataType data_type, tensorflow::Tensor tensor(dtype, tensorflow::TensorShape({})); std::memcpy(tensorflow::TensorCApi::Buffer(tensor)->data(), data, len); - status->status = ::tensorflow::OkStatus(); + status->status = absl::OkStatus(); return 
tensorflow::wrap(tensorflow::TensorHandle::CreateLocalHandle(tensor)); } diff --git a/tensorflow/c/c_api_function.cc b/tensorflow/c/c_api_function.cc index 2fd92bd7dc0546..25805954eff67c 100644 --- a/tensorflow/c/c_api_function.cc +++ b/tensorflow/c/c_api_function.cc @@ -44,7 +44,7 @@ Status ValidateNonRefOutput(const Node* node, int idx) { return IsRefType(dt) ? InvalidArgument("Output ", idx, " of node '", node->name(), "' has a reference type ", DataTypeString(dt)) - : OkStatus(); + : absl::OkStatus(); } // Converts `ninputs` and `inputs` into `inputs_tensors` and `input_nodes` and @@ -83,7 +83,7 @@ Status ProcessInputs( indices.push_back(idx); } } - return OkStatus(); + return absl::OkStatus(); } // Converts `noutputs` and `outputs` into `outputs_tensors` and does various @@ -105,7 +105,7 @@ Status ProcessOutputs(const TF_Graph* fn_body, const char* fn_name, fn_name, "'"); output_tensors->emplace_back(node, idx); } - return OkStatus(); + return absl::OkStatus(); } // Populates `body_nodes` with the nodes that will become function's body. 
@@ -142,7 +142,7 @@ Status ComputeBodyNodes( body_nodes->push_back(node); } } - return OkStatus(); + return absl::OkStatus(); } } // namespace @@ -294,7 +294,7 @@ int TF_GraphGetFunctions(TF_Graph* g, TF_Function** funcs, int max_func, func->record = new tensorflow::FunctionRecord(lib.function(i), {}, false); funcs[i] = func; } - status->status = ::tensorflow::OkStatus(); + status->status = absl::OkStatus(); return len; } @@ -315,7 +315,7 @@ TF_Function* TF_FunctionImportFunctionDef(const void* proto, size_t proto_len, TF_Function* func = new TF_Function(); func->record = new tensorflow::FunctionRecord(std::move(fdef), {}, false); - status->status = ::tensorflow::OkStatus(); + status->status = absl::OkStatus(); return func; } @@ -338,7 +338,7 @@ void TF_FunctionSetAttrValueProto(TF_Function* func, const char* attr_name, (*(fdef_or.value()->mutable_attr()))[string(attr_name)] = attr_value; - status->status = ::tensorflow::OkStatus(); + status->status = absl::OkStatus(); } void TF_FunctionGetAttrValueProto(TF_Function* func, const char* attr_name, diff --git a/tensorflow/c/c_api_test.cc b/tensorflow/c/c_api_test.cc index 877e2f262fba44..14045bbc2daef4 100644 --- a/tensorflow/c/c_api_test.cc +++ b/tensorflow/c/c_api_test.cc @@ -249,7 +249,7 @@ void TestEncodeDecode(int line, const std::vector& data) { // Convert back to a C++ Tensor and ensure we get expected output. 
Tensor output; - ASSERT_EQ(OkStatus(), TF_TensorToTensor(dst, &output)) << line; + ASSERT_EQ(absl::OkStatus(), TF_TensorToTensor(dst, &output)) << line; ASSERT_EQ(src.NumElements(), output.NumElements()) << line; for (int64_t i = 0; i < src.NumElements(); ++i) { ASSERT_EQ(data[i], output.flat()(i)) << line; diff --git a/tensorflow/c/eager/BUILD b/tensorflow/c/eager/BUILD index 3a15bb5ba41f7f..f4b480752c90c9 100644 --- a/tensorflow/c/eager/BUILD +++ b/tensorflow/c/eager/BUILD @@ -913,8 +913,8 @@ tf_cuda_library( "@com_google_absl//absl/memory", "@com_google_absl//absl/strings", "@com_google_absl//absl/time", - "@local_tsl//tsl/distributed_runtime/coordination:coordination_service_agent", "@local_xla//xla/tsl/c:tsl_status_internal", + "@local_xla//xla/tsl/distributed_runtime/coordination:coordination_service_agent", ], alwayslink = 1, ) diff --git a/tensorflow/c/eager/c_api_experimental.cc b/tensorflow/c/eager/c_api_experimental.cc index a7eb7798f23dec..05e0cb1c5347df 100644 --- a/tensorflow/c/eager/c_api_experimental.cc +++ b/tensorflow/c/eager/c_api_experimental.cc @@ -31,6 +31,7 @@ limitations under the License. #include "tensorflow/c/tf_status.h" #include "tensorflow/c/tf_status_helper.h" #include "xla/tsl/c/tsl_status_internal.h" +#include "xla/tsl/distributed_runtime/coordination/coordination_service_agent.h" #include "tensorflow/core/common_runtime/composite_device.h" #include "tensorflow/core/common_runtime/device.h" #include "tensorflow/core/common_runtime/eager/eager_operation.h" @@ -44,7 +45,6 @@ limitations under the License. 
#include "tensorflow/core/platform/errors.h" #include "tensorflow/core/platform/mutex.h" #include "tensorflow/core/platform/strcat.h" -#include "tsl/distributed_runtime/coordination/coordination_service_agent.h" #include "tsl/framework/cancellation.h" using tensorflow::string; diff --git a/tensorflow/c/eager/tape.h b/tensorflow/c/eager/tape.h index 12b8d0c77ea7bb..f7fa3b2491a40b 100644 --- a/tensorflow/c/eager/tape.h +++ b/tensorflow/c/eager/tape.h @@ -140,8 +140,8 @@ class GradientTape { // Returns whether any tensor in a list of tensors is being watched and has // a trainable dtype. - bool ShouldRecord(gtl::ArraySlice tensor_ids, - gtl::ArraySlice dtypes) const; + bool ShouldRecord(absl::Span tensor_ids, + absl::Span dtypes) const; // Adds this tensor to the list of watched tensors. // @@ -158,8 +158,8 @@ class GradientTape { // nullptr instead of building zeros when build_default_zeros_grads == true. void RecordOperation( const string& op_type, const std::vector& output_tensors, - gtl::ArraySlice input_tensor_id, - gtl::ArraySlice input_dtypes, + absl::Span input_tensor_id, + absl::Span input_dtypes, const std::function& backward_function_getter, const std::function& backward_function_deleter); @@ -174,8 +174,8 @@ class GradientTape { // is set to false. 
Status ComputeGradient( const VSpace& vspace, - const gtl::ArraySlice target_tensor_ids, - const gtl::ArraySlice source_tensor_ids, + const absl::Span target_tensor_ids, + const absl::Span source_tensor_ids, const std::unordered_map& sources_that_are_targets, gtl::ArraySlice output_gradients, absl::Span result, bool build_default_zeros_grads = true); @@ -283,8 +283,8 @@ class ForwardAccumulator { Status Accumulate( const string& op_type, const std::vector& input_tensors, const std::vector& output_tensors, - gtl::ArraySlice input_tensor_id, - gtl::ArraySlice input_dtypes, + absl::Span input_tensor_id, + absl::Span input_dtypes, const ForwardFunction* forward_function, const std::function& backward_function_getter, const std::function& backward_function_deleter); @@ -306,8 +306,8 @@ class ForwardAccumulator { // Indicates whether the forward accumulator should run on an operation with // the specified inputs and dtypes. - bool ShouldRecord(gtl::ArraySlice tensor_ids, - gtl::ArraySlice dtypes); + bool ShouldRecord(absl::Span tensor_ids, + absl::Span dtypes); // Temporarily push or pop transient state for this accumulator. 
// @@ -392,8 +392,8 @@ inline bool IsDtypeTrainable(DataType dtype) { template bool GradientTape::ShouldRecord( - gtl::ArraySlice tensor_ids, - gtl::ArraySlice dtypes) const { + absl::Span tensor_ids, + absl::Span dtypes) const { CHECK_EQ(tensor_ids.size(), dtypes.size()); for (int i = 0; i < tensor_ids.size(); ++i) { if (tensor_tape_.find(tensor_ids[i]) != tensor_tape_.end()) { @@ -414,8 +414,8 @@ void GradientTape::Watch( template void GradientTape::RecordOperation( const string& op_type, const std::vector& output_tensors, - gtl::ArraySlice input_tensor_id, - gtl::ArraySlice input_dtypes, + absl::Span input_tensor_id, + absl::Span input_dtypes, const std::function& backward_function_getter, const std::function& backward_function_deleter) { if (!ShouldRecord(input_tensor_id, input_dtypes)) { @@ -530,7 +530,7 @@ struct BackpropInitialState { // are needed, are copied and returned in BackpropInitialState. template BackpropInitialState PrepareBackprop( - gtl::ArraySlice target, const TensorTape& tensor_tape, + absl::Span target, const TensorTape& tensor_tape, OpTape* op_tape, const std::unordered_set& sources_set, bool persistent_tape) { std::vector tensor_stack; @@ -605,7 +605,7 @@ std::vector InitialStack( template Status InitialGradients( const VSpace& vspace, - gtl::ArraySlice target_tensor_ids, + absl::Span target_tensor_ids, const std::unordered_map& sources_that_are_targets, gtl::ArraySlice output_gradients, const TensorTape& tensor_tape, const OpTape& op_tape, @@ -690,8 +690,8 @@ constexpr int kMinAggregateBytes = 128 * 1024 * 1024; template Status GradientTape::ComputeGradient( const VSpace& vspace, - const gtl::ArraySlice target_tensor_ids, - const gtl::ArraySlice source_tensor_ids, + const absl::Span target_tensor_ids, + const absl::Span source_tensor_ids, const std::unordered_map& sources_that_are_targets, gtl::ArraySlice output_gradients, absl::Span result, bool build_default_zeros_grads) { @@ -907,8 +907,8 @@ Status GradientTape::ComputeGradient( 
template bool ForwardAccumulator::ShouldRecord( - gtl::ArraySlice tensor_ids, - gtl::ArraySlice dtypes) { + absl::Span tensor_ids, + absl::Span dtypes) { if (call_state_.top().backward_tape != nullptr) { // If we're forwarding Accumulate calls to backward_tape's RecordOperation, // we should also delegate ShouldRecord. @@ -1031,8 +1031,8 @@ template Status ForwardAccumulator::Accumulate( const string& op_type, const std::vector& input_tensors, const std::vector& output_tensors, - gtl::ArraySlice input_tensor_id, - gtl::ArraySlice input_dtypes, + absl::Span input_tensor_id, + absl::Span input_dtypes, const ForwardFunction* forward_function, const std::function& backward_function_getter, const std::function& backward_function_deleter) { diff --git a/tensorflow/c/experimental/gradients/nn_grad.cc b/tensorflow/c/experimental/gradients/nn_grad.cc index d249ed98944758..9d28a6d5cc4714 100644 --- a/tensorflow/c/experimental/gradients/nn_grad.cc +++ b/tensorflow/c/experimental/gradients/nn_grad.cc @@ -26,7 +26,6 @@ limitations under the License. 
using std::vector; using tensorflow::ops::BiasAddGrad; -using tensorflow::ops::Mul; using tensorflow::ops::ReluGrad; namespace tensorflow { diff --git a/tensorflow/c/experimental/next_pluggable_device/BUILD b/tensorflow/c/experimental/next_pluggable_device/BUILD index 3d92b7ad3d2992..56586f757f369b 100644 --- a/tensorflow/c/experimental/next_pluggable_device/BUILD +++ b/tensorflow/c/experimental/next_pluggable_device/BUILD @@ -31,11 +31,11 @@ cc_library( "@com_google_absl//absl/strings", "@com_google_absl//absl/time", "@com_google_absl//absl/types:span", - "@local_tsl//tsl/distributed_runtime/coordination:coordination_service_agent", "@local_xla//xla/pjrt:pjrt_c_api_client", "@local_xla//xla/pjrt:pjrt_client", "@local_xla//xla/pjrt/c:pjrt_c_api_hdrs", "@local_xla//xla/pjrt/c:pjrt_c_api_helpers", + "@local_xla//xla/tsl/distributed_runtime/coordination:coordination_service_agent", ], ) diff --git a/tensorflow/c/experimental/next_pluggable_device/c_api.cc b/tensorflow/c/experimental/next_pluggable_device/c_api.cc index a2ad1977a7ef3d..15a50a0a7c4060 100644 --- a/tensorflow/c/experimental/next_pluggable_device/c_api.cc +++ b/tensorflow/c/experimental/next_pluggable_device/c_api.cc @@ -42,6 +42,7 @@ limitations under the License. #include "xla/pjrt/c/pjrt_c_api_helpers.h" #include "xla/pjrt/pjrt_c_api_client.h" #include "xla/pjrt/pjrt_client.h" +#include "xla/tsl/distributed_runtime/coordination/coordination_service_agent.h" #include "tensorflow/core/common_runtime/next_pluggable_device/plugin_resource.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/resource_handle.h" @@ -51,7 +52,6 @@ limitations under the License. 
#include "tensorflow/core/platform/refcount.h" #include "tensorflow/core/platform/status.h" #include "tensorflow/core/tfrt/common/pjrt_util.h" -#include "tsl/distributed_runtime/coordination/coordination_service_agent.h" TF_Device* TF_GetDevice(TF_OpKernelContext* ctx) { auto* cc_ctx = reinterpret_cast(ctx); diff --git a/tensorflow/c/experimental/ops/gen/cpp/renderers/BUILD b/tensorflow/c/experimental/ops/gen/cpp/renderers/BUILD index c13bc899f2d016..7b62bd72c56903 100644 --- a/tensorflow/c/experimental/ops/gen/cpp/renderers/BUILD +++ b/tensorflow/c/experimental/ops/gen/cpp/renderers/BUILD @@ -36,10 +36,12 @@ tf_cc_tests( ), deps = [ ":renderers", + "//tensorflow/c/experimental/ops/gen/common", "//tensorflow/core:core_cpu", "//tensorflow/core:framework", "//tensorflow/core:test", "//tensorflow/core:test_main", "//tensorflow/core:testlib", + "//tensorflow/core/platform:types", ], ) diff --git a/tensorflow/c/experimental/stream_executor/BUILD b/tensorflow/c/experimental/stream_executor/BUILD index 22cf7275c6efa6..5dcb4a37c7af1d 100644 --- a/tensorflow/c/experimental/stream_executor/BUILD +++ b/tensorflow/c/experimental/stream_executor/BUILD @@ -45,6 +45,8 @@ cc_library( "//tensorflow/core/common_runtime/device:device_utils", "//tensorflow/core/platform:strcat", "@com_google_absl//absl/functional:any_invocable", + "@com_google_absl//absl/status", + "@com_google_absl//absl/strings:str_format", "@local_tsl//tsl/platform:status", "@local_xla//xla/stream_executor", "@local_xla//xla/stream_executor:platform", @@ -67,7 +69,9 @@ cc_library( "//tensorflow/c:tf_status_helper", "@local_tsl//tsl/platform:statusor", "@local_xla//xla/stream_executor", - "@local_xla//xla/stream_executor:stream_executor_internal", + "@local_xla//xla/stream_executor:event_interface", + "@local_xla//xla/stream_executor:stream_executor_interface", + "@local_xla//xla/stream_executor:stream_interface", ], ) diff --git a/tensorflow/c/experimental/stream_executor/stream_executor.cc 
b/tensorflow/c/experimental/stream_executor/stream_executor.cc index 1efbff9241d732..93d07b431ee4cf 100644 --- a/tensorflow/c/experimental/stream_executor/stream_executor.cc +++ b/tensorflow/c/experimental/stream_executor/stream_executor.cc @@ -21,15 +21,20 @@ limitations under the License. // device. #include "tensorflow/c/experimental/stream_executor/stream_executor.h" +#include #include #include #include "absl/functional/any_invocable.h" +#include "absl/status/status.h" +#include "absl/strings/str_format.h" #include "tensorflow/c/c_api_macros.h" #include "tensorflow/c/c_api_macros_internal.h" #include "tensorflow/c/experimental/stream_executor/stream_executor_internal.h" #include "tensorflow/c/tf_status_helper.h" #include "xla/stream_executor/executor_cache.h" +#include "xla/stream_executor/host_memory_allocation.h" +#include "xla/stream_executor/memory_allocation.h" #include "xla/stream_executor/platform.h" #include "xla/stream_executor/platform_manager.h" #include "xla/stream_executor/stream.h" @@ -61,7 +66,7 @@ absl::Status ValidateSPPlatform(const SP_Platform& platform) { TF_RETURN_IF_ERROR( tensorflow::device_utils::ValidateDeviceType(platform.type)); // `visible_device_count` could be 0 at initialization time. - return ::tensorflow::OkStatus(); + return absl::OkStatus(); } absl::Status ValidateSPPlatformFns(const SP_PlatformFns& platform_fns) { @@ -73,33 +78,33 @@ absl::Status ValidateSPPlatformFns(const SP_PlatformFns& platform_fns) { TF_VALIDATE_NOT_NULL(SP_PlatformFns, platform_fns, destroy_stream_executor); TF_VALIDATE_NOT_NULL(SP_PlatformFns, platform_fns, create_device_fns); TF_VALIDATE_NOT_NULL(SP_PlatformFns, platform_fns, destroy_device_fns); - return ::tensorflow::OkStatus(); + return absl::OkStatus(); } absl::Status ValidateSPAllocatorStats(const SP_AllocatorStats& stats) { TF_VALIDATE_STRUCT_SIZE(SP_AllocatorStats, stats, SP_ALLOCATORSTATS_STRUCT_SIZE); // All other fields could theoretically be zero/null. 
- return ::tensorflow::OkStatus(); + return absl::OkStatus(); } absl::Status ValidateSPDeviceMemoryBase(const SP_DeviceMemoryBase& mem) { TF_VALIDATE_STRUCT_SIZE(SP_DeviceMemoryBase, mem, SP_DEVICE_MEMORY_BASE_STRUCT_SIZE); // All other fields could theoretically be zero/null. - return ::tensorflow::OkStatus(); + return absl::OkStatus(); } absl::Status ValidateSPDevice(const SP_Device& device) { TF_VALIDATE_STRUCT_SIZE(SP_Device, device, SP_DEVICE_STRUCT_SIZE); // All other fields could theoretically be zero/null. - return ::tensorflow::OkStatus(); + return absl::OkStatus(); } absl::Status ValidateSPDeviceFns(const SP_DeviceFns& device_fns) { TF_VALIDATE_STRUCT_SIZE(SP_DeviceFns, device_fns, SP_DEVICE_FNS_STRUCT_SIZE); // All other fields could theoretically be zero/null. - return ::tensorflow::OkStatus(); + return absl::OkStatus(); } absl::Status ValidateSPStreamExecutor(const SP_StreamExecutor& se, @@ -135,7 +140,7 @@ absl::Status ValidateSPStreamExecutor(const SP_StreamExecutor& se, TF_VALIDATE_NOT_NULL(SP_StreamExecutor, se, mem_zero); TF_VALIDATE_NOT_NULL(SP_StreamExecutor, se, memset); TF_VALIDATE_NOT_NULL(SP_StreamExecutor, se, memset32); - return ::tensorflow::OkStatus(); + return absl::OkStatus(); } absl::Status ValidateSEPlatformRegistrationParams( @@ -145,7 +150,7 @@ absl::Status ValidateSEPlatformRegistrationParams( TF_VALIDATE_NOT_NULL(SE_PlatformRegistrationParams, params, destroy_platform); TF_VALIDATE_NOT_NULL(SE_PlatformRegistrationParams, params, destroy_platform_fns); - return ::tensorflow::OkStatus(); + return absl::OkStatus(); } #undef TF_VALIDATE_NOT_NULL @@ -195,7 +200,7 @@ void HostCallbackTrampoline(void* ctx, TF_Status* status) { delete host_ctx; } -class CStreamExecutor : public internal::StreamExecutorInterface { +class CStreamExecutor : public StreamExecutorInterface { public: explicit CStreamExecutor(SP_Device device, SP_DeviceFns* device_fns, SP_StreamExecutor* stream_executor, @@ -215,9 +220,7 @@ class CStreamExecutor : public 
internal::StreamExecutorInterface { platform_fns_->destroy_device(platform_, &device_); } - absl::Status Init(int device_ordinal) override { - return ::tensorflow::OkStatus(); - } + absl::Status Init() override { return absl::OkStatus(); } DeviceMemoryBase Allocate(uint64 size, int64_t memory_space) override { SP_DeviceMemoryBase mem = {SP_DEVICE_MEMORY_BASE_STRUCT_SIZE}; @@ -237,17 +240,20 @@ class CStreamExecutor : public internal::StreamExecutorInterface { stream_executor_->deallocate(&device_, &device_memory_base); } - void* HostMemoryAllocate(uint64 size) override { - return stream_executor_->host_memory_allocate(&device_, size); + absl::StatusOr> HostMemoryAllocate( + uint64 size) override { + auto* buffer = stream_executor_->host_memory_allocate(&device_, size); + if (buffer == nullptr && size > 0) { + return absl::InternalError( + absl::StrFormat("Failed to allocate HostMemory of size %d", size)); + } + return std::make_unique(buffer, size, this); } void HostMemoryDeallocate(void* mem) override { stream_executor_->host_memory_deallocate(&device_, mem); } - bool HostMemoryRegister(void* mem, uint64 size) override { return false; } - bool HostMemoryUnregister(void* mem) override { return false; } - void* UnifiedMemoryAllocate(uint64 size) override { CHECK(stream_executor_->unified_memory_allocate); return stream_executor_->unified_memory_allocate(&device_, size); @@ -302,11 +308,6 @@ class CStreamExecutor : public internal::StreamExecutorInterface { return tsl::errors::Unimplemented( "SynchronousMemZero is not supported by pluggable device."); } - absl::Status SynchronousMemSet(DeviceMemoryBase* location, int value, - uint64 size) override { - return tsl::errors::Unimplemented( - "SynchronousMemSet is not supported by pluggable device."); - } absl::Status SynchronousMemcpy(DeviceMemoryBase* gpu_dst, const void* host_src, uint64 size) override { OwnedTFStatus c_status(TF_NewStatus()); @@ -324,16 +325,6 @@ class CStreamExecutor : public 
internal::StreamExecutorInterface { size, c_status.get()); return StatusFromTF_Status(c_status.get()); } - absl::Status SynchronousMemcpyDeviceToDevice(DeviceMemoryBase* gpu_dst, - const DeviceMemoryBase& gpu_src, - uint64 size) override { - OwnedTFStatus c_status(TF_NewStatus()); - SP_DeviceMemoryBase device_mem_dst = DeviceMemoryBaseToC(gpu_dst); - SP_DeviceMemoryBase device_mem_src = DeviceMemoryBaseToC(&gpu_src); - stream_executor_->sync_memcpy_dtod(&device_, &device_mem_dst, - &device_mem_src, size, c_status.get()); - return StatusFromTF_Status(c_status.get()); - } absl::Status MemZero(Stream* stream, DeviceMemoryBase* location, uint64 size) override { OwnedTFStatus c_status(TF_NewStatus()); @@ -420,7 +411,7 @@ class CStreamExecutor : public internal::StreamExecutorInterface { } absl::Status DeallocateEvent(Event* event) override { static_cast(event->implementation())->Destroy(); - return ::tensorflow::OkStatus(); + return absl::OkStatus(); } absl::Status RecordEvent(Stream* stream, Event* event) override { SP_Stream stream_handle = @@ -568,14 +559,12 @@ class CStreamExecutor : public internal::StreamExecutorInterface { // Each call creates a new instance of the platform-specific implementation of // the corresponding interface type. 
- std::unique_ptr CreateEventImplementation() - override { - return std::unique_ptr( + std::unique_ptr CreateEventImplementation() override { + return std::unique_ptr( new CEvent(&device_, stream_executor_)); } - std::unique_ptr GetStreamImplementation() - override { - return std::unique_ptr( + std::unique_ptr GetStreamImplementation() override { + return std::unique_ptr( new CStream(&device_, stream_executor_)); } @@ -655,11 +644,10 @@ absl::StatusOr> CPlatform::GetUncachedExecutor( c_status.get()); TF_RETURN_IF_ERROR(StatusFromTF_Status(c_status.get())); - auto executor = absl::make_unique( + auto executor = std::make_unique( std::move(device), &device_fns_, &stream_executor_, &platform_, &platform_fns_, &timer_fns_, name_, visible_device_count); - auto result = absl::make_unique(this, std::move(executor), - config.ordinal); + auto result = std::make_unique(this, std::move(executor)); return result; } @@ -735,6 +723,6 @@ absl::Status InitStreamExecutorPlugin(SEInitPluginFn init_fn, stream_executor::PlatformManager::RegisterPlatform(std::move(cplatform))); // TODO(annarev): Return `use_bfc_allocator` value in some way so that it is // available in `PluggableDeviceProcessState` once the latter is checked in. - return ::tensorflow::OkStatus(); + return absl::OkStatus(); } } // namespace stream_executor diff --git a/tensorflow/c/experimental/stream_executor/stream_executor_internal.h b/tensorflow/c/experimental/stream_executor/stream_executor_internal.h index ad193a045cba50..48ea2ccf26d6f9 100644 --- a/tensorflow/c/experimental/stream_executor/stream_executor_internal.h +++ b/tensorflow/c/experimental/stream_executor/stream_executor_internal.h @@ -20,10 +20,12 @@ limitations under the License. 
#include "tensorflow/c/experimental/stream_executor/stream_executor.h" #include "tensorflow/c/tf_status_helper.h" +#include "xla/stream_executor/event_interface.h" #include "xla/stream_executor/executor_cache.h" #include "xla/stream_executor/platform.h" #include "xla/stream_executor/stream_executor.h" -#include "xla/stream_executor/stream_executor_internal.h" +#include "xla/stream_executor/stream_executor_interface.h" +#include "xla/stream_executor/stream_interface.h" #include "tsl/platform/statusor.h" namespace stream_executor { @@ -35,14 +37,15 @@ typedef void (*SEInitPluginFn)(SE_PlatformRegistrationParams* const, // Registers StreamExecutor platform. `device_type` and `platform_name` are // output parameters. -tsl::Status InitStreamExecutorPlugin(void* dso_handle, std::string* device_type, - std::string* platform_name); +absl::Status InitStreamExecutorPlugin(void* dso_handle, + std::string* device_type, + std::string* platform_name); // Allow registering a StreamExecutor plugin using a function (used for // testing). -tsl::Status InitStreamExecutorPlugin(SEInitPluginFn init_fn, - std::string* device_type, - std::string* platform_name); +absl::Status InitStreamExecutorPlugin(SEInitPluginFn init_fn, + std::string* device_type, + std::string* platform_name); // This file implements core stream executor base classes in terms of // the C API defined in stream_executor.h. 
A class "CSomething" represents a @@ -72,12 +75,12 @@ class CPlatform : public Platform { } bool UseBfcAllocator() const { return platform_.use_bfc_allocator; } bool ForceMemoryGrowth() const { return platform_.force_memory_growth; } - tsl::StatusOr> DescriptionForDevice( + absl::StatusOr> DescriptionForDevice( int ordinal) const override; - tsl::StatusOr ExecutorForDevice(int ordinal) override; - tsl::StatusOr GetExecutor( + absl::StatusOr ExecutorForDevice(int ordinal) override; + absl::StatusOr GetExecutor( const StreamExecutorConfig& config) override; - tsl::StatusOr> GetUncachedExecutor( + absl::StatusOr> GetUncachedExecutor( const StreamExecutorConfig& config) override; void DestroyAllExecutors() { executor_cache_.DestroyAllExecutors(); } @@ -95,7 +98,7 @@ class CPlatform : public Platform { stream_executor::ExecutorCache executor_cache_; }; -class CStream : public internal::StreamInterface { +class CStream : public StreamInterface { public: CStream(SP_Device* device, SP_StreamExecutor* stream_executor) : device_(device), @@ -103,10 +106,10 @@ class CStream : public internal::StreamInterface { stream_handle_(nullptr) {} ~CStream() override { Destroy(); } - tsl::Status Create() { + absl::Status Create() { tensorflow::TF_StatusPtr c_status(TF_NewStatus()); stream_executor_->create_stream(device_, &stream_handle_, c_status.get()); - tsl::Status s = tensorflow::StatusFromTF_Status(c_status.get()); + absl::Status s = tensorflow::StatusFromTF_Status(c_status.get()); return s; } @@ -125,7 +128,7 @@ class CStream : public internal::StreamInterface { SP_Stream stream_handle_; }; -class CEvent : public internal::EventInterface { +class CEvent : public EventInterface { public: CEvent(SP_Device* device, SP_StreamExecutor* stream_executor) : device_(device), @@ -133,13 +136,13 @@ class CEvent : public internal::EventInterface { event_handle_(nullptr) {} ~CEvent() override { Destroy(); } - tsl::Status Create() { + absl::Status Create() { tensorflow::TF_StatusPtr 
c_status(TF_NewStatus()); stream_executor_->create_event(device_, &event_handle_, c_status.get()); return tensorflow::StatusFromTF_Status(c_status.get()); } - tsl::Status Record(SP_Stream stream_handle) { + absl::Status Record(SP_Stream stream_handle) { tensorflow::TF_StatusPtr c_status(TF_NewStatus()); stream_executor_->record_event(device_, stream_handle, event_handle_, c_status.get()); diff --git a/tensorflow/c/experimental/stream_executor/stream_executor_test.cc b/tensorflow/c/experimental/stream_executor/stream_executor_test.cc index e4dda6c0a6c177..56f25a5811293e 100644 --- a/tensorflow/c/experimental/stream_executor/stream_executor_test.cc +++ b/tensorflow/c/experimental/stream_executor/stream_executor_test.cc @@ -39,17 +39,17 @@ TEST(StreamExecutor, SuccessfulRegistration) { test_util::PopulateDefaultPlatformRegistrationParams(params); }; std::string device_type, platform_name; - tsl::Status status = + absl::Status status = InitStreamExecutorPlugin(plugin_init, &device_type, &platform_name); TF_ASSERT_OK(status); - tsl::StatusOr maybe_platform = + absl::StatusOr maybe_platform = PlatformManager::PlatformWithName("MY_DEVICE"); TF_ASSERT_OK(maybe_platform.status()); Platform* platform = std::move(maybe_platform).value(); ASSERT_EQ(platform->Name(), test_util::kDeviceName); ASSERT_EQ(platform->VisibleDeviceCount(), test_util::kDeviceCount); - tsl::StatusOr maybe_executor = + absl::StatusOr maybe_executor = platform->ExecutorForDevice(0); TF_ASSERT_OK(maybe_executor.status()); } @@ -63,7 +63,7 @@ TEST(StreamExecutor, NameNotSet) { }; std::string device_type, platform_name; - tsl::Status status = + absl::Status status = InitStreamExecutorPlugin(plugin_init, &device_type, &platform_name); ASSERT_EQ(status.code(), tensorflow::error::FAILED_PRECONDITION); ASSERT_EQ(status.message(), "'name' field in SP_Platform must be set."); @@ -78,7 +78,7 @@ TEST(StreamExecutor, InvalidNameWithSemicolon) { }; std::string device_type, platform_name; - tsl::Status status = + 
absl::Status status = InitStreamExecutorPlugin(plugin_init, &device_type, &platform_name); ASSERT_EQ(status.code(), tensorflow::error::FAILED_PRECONDITION); EXPECT_THAT( @@ -95,7 +95,7 @@ TEST(StreamExecutor, InvalidNameWithSlash) { }; std::string device_type, platform_name; - tsl::Status status = + absl::Status status = InitStreamExecutorPlugin(plugin_init, &device_type, &platform_name); ASSERT_EQ(status.code(), tensorflow::error::FAILED_PRECONDITION); EXPECT_THAT(status.message(), @@ -111,7 +111,7 @@ TEST(StreamExecutor, CreateDeviceNotSet) { }; std::string device_type, platform_name; - tsl::Status status = + absl::Status status = InitStreamExecutorPlugin(plugin_init, &device_type, &platform_name); ASSERT_EQ(status.code(), tensorflow::error::FAILED_PRECONDITION); ASSERT_EQ(status.message(), @@ -127,7 +127,7 @@ TEST(StreamExecutor, UnifiedMemoryAllocateNotSet) { }; std::string device_type, platform_name; - tsl::Status status = + absl::Status status = InitStreamExecutorPlugin(plugin_init, &device_type, &platform_name); ASSERT_EQ(status.code(), tensorflow::error::FAILED_PRECONDITION); ASSERT_EQ( @@ -153,7 +153,7 @@ class StreamExecutorTest : public ::testing::Test { platform_, test_util::DestroyPlatform, platform_fns_, test_util::DestroyPlatformFns, device_fns_, se_, timer_fns_); } - tsl::StatusOr maybe_executor = + absl::StatusOr maybe_executor = cplatform_->ExecutorForDevice(ordinal); TF_CHECK_OK(maybe_executor.status()); return std::move(maybe_executor).value(); @@ -185,7 +185,6 @@ TEST_F(StreamExecutorTest, Allocate) { ASSERT_NE(mem.opaque(), nullptr); ASSERT_EQ(mem.size(), 2 * sizeof(int)); executor->Deallocate(&mem); - ASSERT_EQ(mem.opaque(), nullptr); } TEST_F(StreamExecutorTest, HostMemoryAllocate) { @@ -515,25 +514,6 @@ TEST_F(StreamExecutorTest, SyncMemcpyFromHost) { ASSERT_EQ(dst_data, 18); } -TEST_F(StreamExecutorTest, SyncMemcpyDeviceToDevice) { - se_.sync_memcpy_dtod = [](const SP_Device* const device, - SP_DeviceMemoryBase* const device_dst, - const 
SP_DeviceMemoryBase* const device_src, - uint64_t size, TF_Status* const status) { - TF_SetStatus(status, TF_OK, ""); - std::memcpy(device_dst->opaque, device_src->opaque, size); - }; - - StreamExecutor* executor = GetExecutor(0); - size_t size = sizeof(int); - int src_data = 18; - int dst_data = 0; - DeviceMemoryBase device_dst(&dst_data, size); - DeviceMemoryBase device_src(&src_data, size); - ASSERT_TRUE(executor->SynchronousMemcpy(&device_dst, device_src, size)); - ASSERT_EQ(dst_data, 18); -} - TEST_F(StreamExecutorTest, BlockHostForEvent) { static bool block_host_for_event_called = false; se_.create_event = [](const SP_Device* const device, SP_Event* event, @@ -625,7 +605,7 @@ TEST_F(StreamExecutorTest, HostCallbackError) { }; StreamExecutor* executor = GetExecutor(0); TF_ASSERT_OK_AND_ASSIGN(auto stream, executor->CreateStream()); - std::function callback = []() -> tsl::Status { + std::function callback = []() -> absl::Status { return tsl::errors::Unimplemented("Unimplemented"); }; ASSERT_FALSE(stream->DoHostCallbackWithStatus(callback).ok()); diff --git a/tensorflow/c/kernels_experimental.cc b/tensorflow/c/kernels_experimental.cc index 09ce84d42f7392..26173507f29aec 100644 --- a/tensorflow/c/kernels_experimental.cc +++ b/tensorflow/c/kernels_experimental.cc @@ -16,6 +16,7 @@ limitations under the License. #include "tensorflow/c/kernels_experimental.h" #include +#include #include #include #include @@ -74,7 +75,7 @@ tensorflow::Status EnsureSparseVariableAccess( tensorflow::Var* var, bool lock_held = false) { auto* context = reinterpret_cast<::tensorflow::OpKernelContext*>(ctx); if (var->copy_on_read_mode.load()) { - return ::tensorflow::OkStatus(); + return absl::OkStatus(); } std::optional ml; @@ -87,7 +88,7 @@ tensorflow::Status EnsureSparseVariableAccess( // copy-on-read mode is false. 
if (var->tensor()->RefCountIsOne()) { var->copy_on_read_mode.store(true); - return ::tensorflow::OkStatus(); + return absl::OkStatus(); } Tensor tmp; if (variantType) { @@ -114,7 +115,7 @@ tensorflow::Status EnsureSparseVariableAccess( } *var->tensor() = tmp; var->copy_on_read_mode.store(true); - return ::tensorflow::OkStatus(); + return absl::OkStatus(); } tensorflow::Status PrepareToUpdateVariable( @@ -151,7 +152,7 @@ tensorflow::Status PrepareToUpdateVariable( } *tensor = tmp; } - return ::tensorflow::OkStatus(); + return absl::OkStatus(); } tensorflow::mutex* GetTrainingVariableMutex(TF_OpKernelContext* ctx, @@ -186,7 +187,7 @@ void TF_AssignVariable(TF_OpKernelContext* ctx, int input_index, *ptr = new tensorflow::Var(value.dtype()); *(*ptr)->tensor() = value; (*ptr)->is_initialized = true; - return ::tensorflow::OkStatus(); + return absl::OkStatus(); })); tensorflow::mutex_lock ml(*variable->mu()); @@ -414,9 +415,9 @@ void TF_MaybeLockVariableInputMutexesInOrder( std::sort(acquire_order.begin(), acquire_order.end(), [&mutexes](int a, int b) { return mutexes[a] < mutexes[b]; }); - auto locks = absl::make_unique>(); + auto locks = std::make_unique>(); auto shared_locks = - absl::make_unique>(); + std::make_unique>(); locks->reserve(acquire_order.size()); for (auto acquire : acquire_order) { @@ -565,7 +566,7 @@ static Status ValidateVariantType(const Variant& variant) { type_index_name); } - return ::tensorflow::OkStatus(); + return absl::OkStatus(); } static Status VariantBinaryAddFunc( @@ -581,11 +582,11 @@ static Status CCBinaryAddFunc( TF_Tensor* out)) { if (cc_a.dtype() == ::tensorflow::DT_INVALID) { *cc_out = cc_b; - return ::tensorflow::OkStatus(); + return absl::OkStatus(); } if (cc_b.dtype() == ::tensorflow::DT_INVALID) { *cc_out = cc_a; - return ::tensorflow::OkStatus(); + return absl::OkStatus(); } Status status; diff --git a/tensorflow/c/tf_buffer.cc b/tensorflow/c/tf_buffer.cc index 864a9e79818db9..a891f89ed16d0c 100644 --- 
a/tensorflow/c/tf_buffer.cc +++ b/tensorflow/c/tf_buffer.cc @@ -78,7 +78,7 @@ Status MessageToBuffer(const tensorflow::protobuf::MessageLite& in, out->data = buf; out->length = proto_size; out->data_deallocator = [](void* data, size_t length) { port::Free(data); }; - return OkStatus(); + return absl::OkStatus(); } Status BufferToMessage(const TF_Buffer* in, @@ -87,7 +87,7 @@ Status BufferToMessage(const TF_Buffer* in, return errors::InvalidArgument("Unparseable ", out->GetTypeName(), " proto"); } - return OkStatus(); + return absl::OkStatus(); } } // namespace tensorflow diff --git a/tensorflow/c/tf_status_helper.cc b/tensorflow/c/tf_status_helper.cc index bb07a9213b4256..bbeae6f76bc497 100644 --- a/tensorflow/c/tf_status_helper.cc +++ b/tensorflow/c/tf_status_helper.cc @@ -22,7 +22,8 @@ limitations under the License. namespace tsl { -void Set_TF_Status_from_Status(TF_Status* tf_status, const Status& status) { +void Set_TF_Status_from_Status(TF_Status* tf_status, + const absl::Status& status) { TF_SetStatus(tf_status, TSLCodeFromStatusCode(status.code()), tsl::NullTerminatedMessage(status)); status.ForEachPayload( @@ -33,13 +34,13 @@ void Set_TF_Status_from_Status(TF_Status* tf_status, const Status& status) { }); } -Status StatusFromTF_Status(const TF_Status* tf_status) { - Status status(StatusCodeFromTSLCode(TF_GetCode(tf_status)), - TF_Message(tf_status)); +absl::Status StatusFromTF_Status(const TF_Status* tf_status) { + absl::Status status(StatusCodeFromTSLCode(TF_GetCode(tf_status)), + TF_Message(tf_status)); TF_ForEachPayload( tf_status, [](const char* key, const char* value, void* capture) { - Status* status = static_cast(capture); + absl::Status* status = static_cast(capture); status->SetPayload(key, absl::Cord(absl::string_view(value))); }, &status); diff --git a/tensorflow/c/tf_status_helper.h b/tensorflow/c/tf_status_helper.h index 0f5c2faa6a0a65..ce833c394cb01b 100644 --- a/tensorflow/c/tf_status_helper.h +++ b/tensorflow/c/tf_status_helper.h @@ -24,10 
+24,11 @@ limitations under the License. namespace tsl { // Set the attribute of "tf_status" from the attributes of "status". -void Set_TF_Status_from_Status(TF_Status* tf_status, const Status& status); +void Set_TF_Status_from_Status(TF_Status* tf_status, + const absl::Status& status); // Returns a "status" from "tf_status". -Status StatusFromTF_Status(const TF_Status* tf_status); +absl::Status StatusFromTF_Status(const TF_Status* tf_status); } // namespace tsl namespace tensorflow { diff --git a/tensorflow/c/tf_status_helper_test.cc b/tensorflow/c/tf_status_helper_test.cc index e99c64d68d335d..653395437821c3 100644 --- a/tensorflow/c/tf_status_helper_test.cc +++ b/tensorflow/c/tf_status_helper_test.cc @@ -23,14 +23,14 @@ namespace { TEST(StatusHelper, TestStatusHelper) { TSL_Status* s = TSL_NewStatus(); - Status cc_status(absl::InvalidArgumentError("some error")); + absl::Status cc_status(absl::InvalidArgumentError("some error")); cc_status.SetPayload("key1", absl::Cord("value1")); cc_status.SetPayload("key2", absl::Cord("value2")); Set_TF_Status_from_Status(s, cc_status); ASSERT_EQ(TSL_INVALID_ARGUMENT, TSL_GetCode(s)); ASSERT_EQ(std::string("some error"), TSL_Message(s)); - Status another_cc_status(StatusFromTF_Status(s)); + absl::Status another_cc_status(StatusFromTF_Status(s)); ASSERT_FALSE(another_cc_status.ok()); ASSERT_EQ(std::string("some error"), another_cc_status.message()); ASSERT_EQ(error::INVALID_ARGUMENT, another_cc_status.code()); diff --git a/tensorflow/c/tf_tensor.cc b/tensorflow/c/tf_tensor.cc index 701c6fe825c36a..96c3fd97344115 100644 --- a/tensorflow/c/tf_tensor.cc +++ b/tensorflow/c/tf_tensor.cc @@ -260,7 +260,7 @@ Status TensorInterface::BitcastFrom(const TensorInterface& from, DataType type, Status TensorInterface::FromProto(const tensorflow::TensorProto& from) { bool success = tensor_.FromProto(from); - if (success) return OkStatus(); + if (success) return absl::OkStatus(); return errors::InvalidArgument("Unparseable tensor proto"); } @@ 
-296,7 +296,7 @@ namespace tensorflow { AbstractTensorInterface* TensorInterfaceFromTensor(const Tensor& src, Status* status) { - *status = OkStatus(); + *status = absl::OkStatus(); if (!src.IsInitialized()) { *status = FailedPrecondition( "attempt to use a tensor with an uninitialized value"); @@ -324,7 +324,7 @@ TF_Tensor* TF_TensorFromTensor(const tensorflow::Tensor& src, Status* status) { TF_Tensor* TF_TensorFromTensorShallow(const tensorflow::Tensor& src, Status* status) { - *status = OkStatus(); + *status = absl::OkStatus(); if (!src.IsInitialized()) { *status = FailedPrecondition( "attempt to use a tensor with an uninitialized value"); @@ -343,7 +343,7 @@ Status TF_TensorToTensor(const TF_Tensor* src, Tensor* dst) { Status TensorInterface::ToTensor(tensorflow::Tensor* dst) const { *dst = tensor_; - return OkStatus(); + return absl::OkStatus(); } bool TensorInterface::IsAligned() const { return tensor_.IsAligned(); } diff --git a/tensorflow/c/while_loop_test.cc b/tensorflow/c/while_loop_test.cc index 687c73d8b4e495..1bb04fec430b23 100644 --- a/tensorflow/c/while_loop_test.cc +++ b/tensorflow/c/while_loop_test.cc @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ +#include + #include "tensorflow/c/c_api.h" #include "tensorflow/c/c_test_util.h" #include "tensorflow/core/platform/logging.h" @@ -45,8 +47,8 @@ class CApiWhileLoopTest : public ::testing::Test { original_graph_description_ = GraphDebugString(); - params_.reset(new TF_WhileParams( - TF_NewWhile(graph_, &inputs_[0], inputs_.size(), s_))); + params_ = std::make_unique( + TF_NewWhile(graph_, &inputs_[0], inputs_.size(), s_)); ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); ASSERT_EQ(original_graph_description_, GraphDebugString()) << "TF_NewWhile() altered graph"; @@ -85,7 +87,7 @@ class CApiWhileLoopTest : public ::testing::Test { ++i; } // TODO(skyewm): use std::make_unique or absl::make_unique when possible. - csession_.reset(new CSession(graph_, s_)); + csession_ = std::make_unique(graph_, s_); csession_->SetInputs(inputs); csession_->SetOutputs(run_outputs); csession_->Run(s_); diff --git a/tensorflow/cc/experimental/base/tests/tensor_test.cc b/tensorflow/cc/experimental/base/tests/tensor_test.cc index 33f9ab637e82a5..95610f098cc470 100644 --- a/tensorflow/cc/experimental/base/tests/tensor_test.cc +++ b/tensorflow/cc/experimental/base/tests/tensor_test.cc @@ -82,7 +82,7 @@ TYPED_TEST(Construct1DTensorTest, ValidTensorAttributesAfterConstruction) { EXPECT_EQ(tensor.dims(), 1); EXPECT_EQ(tensor.dtype(), dtype); - tensorflow::gtl::ArraySlice tensor_view( + absl::Span tensor_view( reinterpret_cast(tensor.data()), value.size()); EXPECT_EQ(tensor_view[0], 42); EXPECT_EQ(tensor_view[1], 100); @@ -121,7 +121,7 @@ TYPED_TEST(Construct2DTensorTest, ValidTensorAttributesAfterConstruction) { EXPECT_EQ(tensor.dims(), 2); EXPECT_EQ(tensor.dtype(), dtype); - tensorflow::gtl::ArraySlice tensor_view( + absl::Span tensor_view( reinterpret_cast(tensor.data()), value.size()); EXPECT_EQ(tensor_view[0], 42); EXPECT_EQ(tensor_view[1], 100); diff --git 
a/tensorflow/cc/experimental/base/tests/tensorhandle_test.cc b/tensorflow/cc/experimental/base/tests/tensorhandle_test.cc index cfeaba4e3923ca..77ac7052baa0fe 100644 --- a/tensorflow/cc/experimental/base/tests/tensorhandle_test.cc +++ b/tensorflow/cc/experimental/base/tests/tensorhandle_test.cc @@ -116,7 +116,7 @@ TYPED_TEST(Construct1DTensorHandleTest, EXPECT_EQ(tensor.dims(), 1); EXPECT_EQ(tensor.dtype(), dtype); - tensorflow::gtl::ArraySlice tensor_view( + absl::Span tensor_view( reinterpret_cast(tensor.data()), value.size()); EXPECT_EQ(tensor_view[0], 42); EXPECT_EQ(tensor_view[1], 100); @@ -166,7 +166,7 @@ TYPED_TEST(Construct2DTensorHandleTest, EXPECT_EQ(tensor.dims(), 2); EXPECT_EQ(tensor.dtype(), dtype); - tensorflow::gtl::ArraySlice tensor_view( + absl::Span tensor_view( reinterpret_cast(tensor.data()), value.size()); EXPECT_EQ(tensor_view[0], 42); EXPECT_EQ(tensor_view[1], 100); diff --git a/tensorflow/cc/experimental/libtf/value.h b/tensorflow/cc/experimental/libtf/value.h index c8347e6c3033d7..61a2888426ee3d 100644 --- a/tensorflow/cc/experimental/libtf/value.h +++ b/tensorflow/cc/experimental/libtf/value.h @@ -56,7 +56,7 @@ using Dict = using DictPtr = std::shared_ptr; using TuplePtr = std::shared_ptr; using Func = - std::function(TaggedValue, TaggedValue)>; + std::function(TaggedValue, TaggedValue)>; // A capsule holds a pointer and a destructor for the pointer (i.e. a generic // shared_ptr to void with a custom deleter). 
using Capsule = std::shared_ptr; diff --git a/tensorflow/cc/framework/grad_op_registry.cc b/tensorflow/cc/framework/grad_op_registry.cc index 268ea764de8a4c..26628759277889 100644 --- a/tensorflow/cc/framework/grad_op_registry.cc +++ b/tensorflow/cc/framework/grad_op_registry.cc @@ -41,7 +41,7 @@ Status GradOpRegistry::Lookup(const string& op, GradFunc* func) const { return errors::NotFound(error_msg); } *func = iter->second; - return OkStatus(); + return absl::OkStatus(); } } // end namespace ops diff --git a/tensorflow/cc/framework/gradient_checker.cc b/tensorflow/cc/framework/gradient_checker.cc index 0c026cf9a0c2c5..90f104bc24b129 100644 --- a/tensorflow/cc/framework/gradient_checker.cc +++ b/tensorflow/cc/framework/gradient_checker.cc @@ -183,7 +183,7 @@ Status ComputeTheoreticalJacobianTranspose( } } } - return OkStatus(); + return absl::OkStatus(); } Status EvaluateGraph(ClientSession* session, const OutputList& xs, @@ -208,7 +208,7 @@ Status EvaluateGraph(ClientSession* session, const OutputList& xs, } } } - return OkStatus(); + return absl::OkStatus(); } template @@ -272,7 +272,7 @@ Status ComputeNumericJacobianTranspose(const Scope& scope, const OutputList& xs, } } } - return OkStatus(); + return absl::OkStatus(); } // The Jacobian is always a real-valued matrix. @@ -366,13 +366,13 @@ Status ComputeGradientErrorInternal(const Scope& scope, const OutputList& xs, // (Note that std::max may ignore NaN arguments.) 
if (std::isnan(cur_error)) { *max_error = cur_error; - return OkStatus(); + return absl::OkStatus(); } *max_error = std::max(*max_error, cur_error); } } } - return OkStatus(); + return absl::OkStatus(); } } // namespace diff --git a/tensorflow/cc/framework/gradients.cc b/tensorflow/cc/framework/gradients.cc index e0a399d6b1c0da..548f5c04833a2e 100644 --- a/tensorflow/cc/framework/gradients.cc +++ b/tensorflow/cc/framework/gradients.cc @@ -166,7 +166,7 @@ Status SymbolicGradientBuilder::BackpropAlongEdge(const Output& dst_grad, ready_.push_back(src.node()); } } - return OkStatus(); + return absl::OkStatus(); } std::vector SymbolicGradientBuilder::GetReachableNodes() { @@ -341,7 +341,7 @@ Status SymbolicGradientBuilder::Initialize() { TF_RETURN_IF_ERROR(BackpropAlongEdge(grad_inputs_[i], outputs_[i])); } } - return OkStatus(); + return absl::OkStatus(); } Status SymbolicGradientBuilder::SumGradients(const Output& src, Output* grad) { @@ -372,7 +372,7 @@ Status SymbolicGradientBuilder::SumGradients(const Output& src, Output* grad) { *grad = ops::AddN(scope_, grads_to_keep); } - return OkStatus(); + return absl::OkStatus(); } bool SymbolicGradientBuilder::IsPrimitiveOpWithNoGrad(const string& opname) { @@ -388,7 +388,7 @@ Status SymbolicGradientBuilder::CallGradFunction( TF_RETURN_IF_ERROR(registry_->Lookup(op.node()->type_string(), &grad_fn)); TF_RETURN_IF_ERROR(grad_fn(scope_, op, grad_inputs, grad_outputs)); TF_RETURN_IF_ERROR(scope_.status()); - return OkStatus(); + return absl::OkStatus(); } Status SymbolicGradientBuilder::ProcessWhileLoop(Node* exit_node, @@ -414,7 +414,8 @@ Status SymbolicGradientBuilder::ProcessWhileLoop(Node* exit_node, // Wait until we have all exit nodes' backprops collected before processing // the while loop. // TODO(skyewm): what if not all the exit nodes are reachable? 
- if (backprops.size() < while_ctx->exit_nodes().size()) return OkStatus(); + if (backprops.size() < while_ctx->exit_nodes().size()) + return absl::OkStatus(); // We've seen all the exit nodes for this loop and have collected all the // backprops. Create the gradient graph for the while loop. @@ -435,7 +436,7 @@ Status SymbolicGradientBuilder::ProcessWhileLoop(Node* exit_node, TF_RETURN_IF_ERROR(BackpropAlongEdge(dx[i], {e->src(), e->src_output()})); } } - return OkStatus(); + return absl::OkStatus(); } Status SymbolicGradientBuilder::AddGradients() { @@ -553,7 +554,7 @@ Status SymbolicGradientBuilder::AddGradients() { int num_requested_inputs = p.first->num_outputs() - pending_[p.first->id()]; CHECK_EQ(num_requested_inputs, p.second); } - return OkStatus(); + return absl::OkStatus(); } } // namespace diff --git a/tensorflow/cc/framework/ops.h b/tensorflow/cc/framework/ops.h index ab8b387ab5681a..7bbb3b2bcb5236 100644 --- a/tensorflow/cc/framework/ops.h +++ b/tensorflow/cc/framework/ops.h @@ -221,7 +221,7 @@ class Input { tensor_(init.tensor) {} Input(const Tensor& t) // NOLINT(runtime/explicit) - : status_(OkStatus()), tensor_(t) {} + : status_(absl::OkStatus()), tensor_(t) {} Input(const std::initializer_list& init) { // NOLINT(runtime/explicit) @@ -274,8 +274,7 @@ class InputList { const std::initializer_list& inputs) // NOLINT(runtime/explicit) : inputs_(inputs.begin(), inputs.end()) {} - InputList(const tensorflow::gtl::ArraySlice& - inputs) // NOLINT(runtime/explicit) + InputList(const absl::Span& inputs) // NOLINT(runtime/explicit) : inputs_(inputs.begin(), inputs.end()) {} InputList( diff --git a/tensorflow/cc/framework/scope.cc b/tensorflow/cc/framework/scope.cc index 6667b6919d52e6..0c972612089918 100644 --- a/tensorflow/cc/framework/scope.cc +++ b/tensorflow/cc/framework/scope.cc @@ -311,7 +311,7 @@ Status Scope::ToGraphDef(GraphDef* gdef, bool include_debug_info) const { return *impl()->status_; } graph()->ToGraphDef(gdef, /*include_flib_def=*/true, 
include_debug_info); - return OkStatus(); + return absl::OkStatus(); } Status Scope::ToGraph(Graph* g, GraphConstructorOptions opts) const { @@ -427,7 +427,7 @@ Scope Scope::WithOpNameImpl(const string& op_name) const { } Scope Scope::WithControlDependencies( - const gtl::ArraySlice control_deps) const { + const absl::Span control_deps) const { return Scope( new Impl(*this, Impl::Tags::ControlDeps(), std::vector(control_deps.begin(), control_deps.end()), @@ -499,7 +499,7 @@ CompositeOpScopes Scope::GetCompositeOpScopes( } Status Scope::DoShapeInference(Node* node) const { - if (impl_->disable_shape_inference_) return OkStatus(); + if (impl_->disable_shape_inference_) return absl::OkStatus(); return impl_->refiner_->AddNode(node); } @@ -547,7 +547,7 @@ Status CreateOutputWithScope(string op_name, scope.UpdateStatus(builder.Finalize(scope.graph(), &ret)); TF_RETURN_IF_ERROR(scope.status()); *output = Output(ret, 0); - return OkStatus(); + return absl::OkStatus(); } } // namespace tensorflow diff --git a/tensorflow/cc/framework/scope.h b/tensorflow/cc/framework/scope.h index 771fdaa11688c9..0b0f6871e7f27c 100644 --- a/tensorflow/cc/framework/scope.h +++ b/tensorflow/cc/framework/scope.h @@ -125,7 +125,7 @@ class Scope { /// Return a new scope. All ops created within the returned scope will have as /// control dependencies the union of operations in the control_deps vector /// and the control dependencies of the current scope. - Scope WithControlDependencies(gtl::ArraySlice control_deps) const; + Scope WithControlDependencies(absl::Span control_deps) const; /// Same as above, but convenient to add control dependency on the operation /// producing the control_dep output. 
Scope WithControlDependencies(const Output& control_dep) const; diff --git a/tensorflow/cc/framework/while_gradients.cc b/tensorflow/cc/framework/while_gradients.cc index e28306e5a33031..9f966994ea2066 100644 --- a/tensorflow/cc/framework/while_gradients.cc +++ b/tensorflow/cc/framework/while_gradients.cc @@ -70,7 +70,7 @@ Status AddForwardLoopCounter(WhileContext* while_ctx, const Scope& scope, const std::vector& inputs, Output* output) { *output = ToOutput(while_ctx->cond_output()); - return OkStatus(); + return absl::OkStatus(); }; // Body function that adds one to input. @@ -88,7 +88,7 @@ Status AddForwardLoopCounter(WhileContext* while_ctx, const Scope& scope, while_ctx->frame_name(), &outputs, /* create_while_ctx */ false)); *count = outputs[0]; - return OkStatus(); + return absl::OkStatus(); } // Creates a loop that executes `loop_count` times. The returned output is the @@ -126,7 +126,7 @@ Status AddBackPropLoopCounter(WhileContext* while_ctx, const Output& loop_count, TF_RETURN_IF_ERROR(BuildWhileLoop( scope, {loop_count}, cond_fn, body_fn, frame_name, &outputs, /* create_while_ctx */ false, backprop_execution_pred)); - return OkStatus(); + return absl::OkStatus(); } // Creates the main backprop loop that computes the gradient of the loop @@ -155,7 +155,7 @@ Status AddWhileGradientLoop(WhileContext* while_ctx, const std::vector& inputs, Output* output) { *output = backprop_execution_pred; - return OkStatus(); + return absl::OkStatus(); }; // Body function that builds while body gradient subgraph. 
@@ -173,7 +173,7 @@ Status AddWhileGradientLoop(WhileContext* while_ctx, TF_RETURN_IF_ERROR(BuildWhileLoop(scope, grad_inputs, cond_fn, body_fn, frame_name, grad_outputs, /* create_while_ctx */ false)); - return OkStatus(); + return absl::OkStatus(); } } // namespace diff --git a/tensorflow/cc/training/coordinator.cc b/tensorflow/cc/training/coordinator.cc index a54f3cb9f6d010..fdbce41c8ca1ef 100644 --- a/tensorflow/cc/training/coordinator.cc +++ b/tensorflow/cc/training/coordinator.cc @@ -45,7 +45,7 @@ Status Coordinator::RegisterRunner(std::unique_ptr runner) { } mutex_lock l(runners_lock_); runners_.push_back(std::move(runner)); - return OkStatus(); + return absl::OkStatus(); } bool Coordinator::AllRunnersStopped() { @@ -66,7 +66,7 @@ Status Coordinator::RequestStop() { } should_stop_ = true; wait_for_stop_.notify_all(); - return OkStatus(); + return absl::OkStatus(); } bool Coordinator::ShouldStop() { @@ -123,7 +123,7 @@ Status Coordinator::ExportCostGraph(CostGraphDef* cost_graph) const { return s; } } - return OkStatus(); + return absl::OkStatus(); } } // namespace tensorflow diff --git a/tensorflow/cc/training/queue_runner.cc b/tensorflow/cc/training/queue_runner.cc index 3a68e981c24c63..e480ea29a8061b 100644 --- a/tensorflow/cc/training/queue_runner.cc +++ b/tensorflow/cc/training/queue_runner.cc @@ -77,7 +77,7 @@ Status QueueRunner::Init(const QueueRunnerDef& queue_runner_def) { thread_pool_.reset(new thread::ThreadPool( Env::Default(), SanitizeThreadSuffix(queue_name_), nthreads)); - return OkStatus(); + return absl::OkStatus(); } QueueRunner::~QueueRunner() { @@ -118,7 +118,7 @@ Status QueueRunner::Start(Session* sess, int wait_for) { return status_; } } - return OkStatus(); + return absl::OkStatus(); } Status QueueRunner::StartAndCollectCostGraph(Session* session, int wait_for_ms, @@ -212,7 +212,7 @@ Status QueueRunner::ExportCostGraph(CostGraphDef* cost_graph) const { } mutex_lock l(*cg_mu_); cost_graph->MergeFrom(*cost_graph_); - return OkStatus(); + 
return absl::OkStatus(); } void QueueRunner::SetRunArgumentsAndCostGraph(const RunOptions& run_options) { diff --git a/tensorflow/compiler/aot/BUILD b/tensorflow/compiler/aot/BUILD index 44f2622834fa4b..1edcfbf51432b7 100644 --- a/tensorflow/compiler/aot/BUILD +++ b/tensorflow/compiler/aot/BUILD @@ -4,6 +4,7 @@ load( "//tensorflow/core/platform:build_config_root.bzl", "if_llvm_aarch32_available", "if_llvm_aarch64_available", + "if_llvm_hexagon_available", "if_llvm_powerpc_available", "if_llvm_system_z_available", "if_llvm_x86_available", @@ -51,6 +52,8 @@ cc_library( compatible_with = [], defines = if_llvm_aarch32_available(["TF_LLVM_AARCH32_AVAILABLE=1"]) + if_llvm_aarch64_available([ "TF_LLVM_AARCH64_AVAILABLE=1", + ]) + if_llvm_hexagon_available([ + "TF_LLVM_HEXAGON_AVAILABLE=1", ]) + if_llvm_powerpc_available([ "TF_LLVM_POWERPC_AVAILABLE=1", ]) + if_llvm_system_z_available([ @@ -141,6 +144,9 @@ cc_library( ]) + if_llvm_aarch64_available([ "@llvm-project//llvm:AArch64AsmParser", # fixdeps: keep "@llvm-project//llvm:AArch64CodeGen", # fixdeps: keep + ]) + if_llvm_hexagon_available([ + "@llvm-project//llvm:HexagonAsmParser", # fixdeps: keep + "@llvm-project//llvm:HexagonCodeGen", # fixdeps: keep ]) + if_llvm_powerpc_available([ "@llvm-project//llvm:PowerPCAsmParser", # fixdeps: keep "@llvm-project//llvm:PowerPCCodeGen", # fixdeps: keep diff --git a/tensorflow/compiler/aot/compile.cc b/tensorflow/compiler/aot/compile.cc index d2a4f53a426f09..e558bd67b8ec7d 100644 --- a/tensorflow/compiler/aot/compile.cc +++ b/tensorflow/compiler/aot/compile.cc @@ -100,7 +100,7 @@ Status CompileXla(xla::CompileOnlyClient* client, compile_result->entry_point = aot_opts.entry_point_name(); compile_result->pointer_size = xla::CompileOnlyClient::PointerSizeForTriple(aot_opts.triple()); - return OkStatus(); + return absl::OkStatus(); } } // namespace @@ -196,6 +196,13 @@ static void InitializeTargets() { LLVMInitializeAArch64AsmParser(); LLVMInitializeAArch64AsmPrinter(); #endif +#if 
TF_LLVM_HEXAGON_AVAILABLE + LLVMInitializeHexagonTarget(); + LLVMInitializeHexagonTargetInfo(); + LLVMInitializeHexagonTargetMC(); + LLVMInitializeHexagonAsmParser(); + LLVMInitializeHexagonAsmPrinter(); +#endif #if TF_LLVM_POWERPC_AVAILABLE LLVMInitializePowerPCTarget(); LLVMInitializePowerPCTargetInfo(); @@ -252,7 +259,7 @@ Status Main(const MainFlags& flags) { nodes.insert(fetch.id().node_name()); } std::cout << absl::StrJoin(nodes, ","); - return OkStatus(); + return absl::OkStatus(); } // Read and initialize the graph. @@ -306,7 +313,7 @@ Status Main(const MainFlags& flags) { TF_RETURN_IF_ERROR(GenerateHeader(codegen_opts, config, compile_result, metadata_result, &header)); TF_RETURN_IF_ERROR(WriteStringToFile(env, flags.out_header, header)); - return OkStatus(); + return absl::OkStatus(); } } // namespace tfcompile diff --git a/tensorflow/compiler/aot/embedded_protocol_buffers.h b/tensorflow/compiler/aot/embedded_protocol_buffers.h index d531996cbb2be1..fc21fc99a0b84e 100644 --- a/tensorflow/compiler/aot/embedded_protocol_buffers.h +++ b/tensorflow/compiler/aot/embedded_protocol_buffers.h @@ -26,7 +26,7 @@ limitations under the License. namespace tensorflow { namespace tfcompile { -using xla::StatusOr; +using absl::StatusOr; // Represents a set of protocol buffers embedded into an object file and // describes how to access them at runtime. diff --git a/tensorflow/compiler/jit/flags.cc b/tensorflow/compiler/jit/flags.cc index f85fd5fde4c1fa..235e8fda0dfc86 100644 --- a/tensorflow/compiler/jit/flags.cc +++ b/tensorflow/compiler/jit/flags.cc @@ -287,7 +287,6 @@ void AllocateAndParseFlags() { bool enable_mlir_multiple_local_cpu_devices = false; // Dump graphs in TFG dialect. 
bool use_tfg_graph_dumper = false; - bool enable_mlir_generic_outside_compilation = false; bool enable_tpu_variable_runtime_reformatting_pass = true; flag_list = new std::vector( @@ -391,10 +390,6 @@ void AllocateAndParseFlags() { Flag("tf_dump_graphs_in_tfg", &use_tfg_graph_dumper, "When tf_dump_graphs_in_tfg is true, graphs after transformations " "are dumped in MLIR TFG dialect and not in GraphDef"), - Flag("tf_mlir_enable_generic_outside_compilation", - &enable_mlir_generic_outside_compilation, - "Enables OutsideCompilation passes for MLIR-Based TensorFlow " - "Generic Compiler Bridge."), Flag("tf_mlir_enable_tpu_variable_runtime_reformatting_pass", &enable_tpu_variable_runtime_reformatting_pass, "Enables TPUVariableRuntimeReformatting pass for MLIR-Based " @@ -422,8 +417,6 @@ void AllocateAndParseFlags() { mlir_flags->tf_mlir_enable_composite_tpuexecute_side_effects = enable_mlir_composite_tpuexecute_side_effects; mlir_flags->tf_mlir_enable_strict_clusters = enable_mlir_strict_clusters; - mlir_flags->tf_mlir_enable_generic_outside_compilation = - enable_mlir_generic_outside_compilation; mlir_flags->tf_mlir_enable_tpu_variable_runtime_reformatting_pass = enable_tpu_variable_runtime_reformatting_pass; mlir_flags->tf_mlir_enable_multiple_local_cpu_devices = diff --git a/tensorflow/compiler/jit/flags.h b/tensorflow/compiler/jit/flags.h index d2c078a617b258..9dbd6106514ab8 100644 --- a/tensorflow/compiler/jit/flags.h +++ b/tensorflow/compiler/jit/flags.h @@ -290,7 +290,6 @@ struct MlirCommonFlags { bool tf_mlir_enable_convert_control_to_data_outputs_pass; bool tf_mlir_enable_composite_tpuexecute_side_effects; bool tf_mlir_enable_strict_clusters; - bool tf_mlir_enable_generic_outside_compilation; bool tf_mlir_enable_tpu_variable_runtime_reformatting_pass; // TODO(pineapplejuice233): Revisit this flag once the performance impact is verified // with different local CPU devices settings. 
diff --git a/tensorflow/compiler/jit/kernels/xla_ops.cc b/tensorflow/compiler/jit/kernels/xla_ops.cc index 25654267a6ae01..9d75388cfbbe80 100644 --- a/tensorflow/compiler/jit/kernels/xla_ops.cc +++ b/tensorflow/compiler/jit/kernels/xla_ops.cc @@ -921,13 +921,13 @@ void XlaRunOp::Compute(OpKernelContext* ctx) { absl::StatusOr> execution_inputs; std::map snapshot_ptrs; { - tensorflow::profiler::TraceMe hlo_module_activity( + tsl::profiler::TraceMe hlo_module_activity( [&] { return absl::StrCat( "Populate Inputs (", closure.compilation_result()->xla_input_shapes.size(), ")"); }, - tensorflow::profiler::TraceMeLevel::kInfo); + tsl::profiler::TraceMeLevel::kInfo); for (const auto& [variable_index, variable_tensor] : closure.resource_var_snapshots()) { @@ -957,11 +957,11 @@ void XlaRunOp::Compute(OpKernelContext* ctx) { closure.executable(), ctx, allocator.get()); OP_REQUIRES(ctx, execution_output.ok(), execution_output.status()); - tensorflow::profiler::TraceMe hlo_module_activity( + tsl::profiler::TraceMe hlo_module_activity( [&] { return absl::StrCat("Populate Outputs (", ctx->num_outputs(), ")"); }, - tensorflow::profiler::TraceMeLevel::kInfo); + tsl::profiler::TraceMeLevel::kInfo); absl::StatusOr> variable_infos = GatherVariableInfo( ctx, *closure.compilation_result(), closure.num_constant_args()); diff --git a/tensorflow/compiler/jit/ops/BUILD b/tensorflow/compiler/jit/ops/BUILD index 6372b2e5516cd3..37a8bf9ce39df6 100644 --- a/tensorflow/compiler/jit/ops/BUILD +++ b/tensorflow/compiler/jit/ops/BUILD @@ -11,7 +11,10 @@ package( cc_library( name = "xla_ops", srcs = ["xla_ops.cc"], - deps = ["//tensorflow/core:framework"], + deps = [ + "//tensorflow/core:framework", + "@com_google_absl//absl/status", + ], alwayslink = 1, ) diff --git a/tensorflow/compiler/jit/ops/xla_ops.cc b/tensorflow/compiler/jit/ops/xla_ops.cc index 7c370e46dec63f..8d49471c33741b 100644 --- a/tensorflow/compiler/jit/ops/xla_ops.cc +++ b/tensorflow/compiler/jit/ops/xla_ops.cc @@ -13,7 +13,7 @@ See 
the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/core/framework/common_shape_fns.h" +#include "absl/status/status.h" #include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/shape_inference.h" diff --git a/tensorflow/compiler/jit/pjrt_device_context.cc b/tensorflow/compiler/jit/pjrt_device_context.cc index 84a651361681dc..2d982fa82e76b8 100644 --- a/tensorflow/compiler/jit/pjrt_device_context.cc +++ b/tensorflow/compiler/jit/pjrt_device_context.cc @@ -62,7 +62,7 @@ absl::StatusOr> HostTensorToPjRtBuffer( auto first_try_buffer = pjrt_client->BufferFromHostBuffer( cpu_tensor->data(), shape.element_type(), shape.dimensions(), /*byte_strides=*/std::nullopt, - xla::PjRtClient::HostBufferSemantics::kZeroCopy, + xla::PjRtClient::HostBufferSemantics::kImmutableZeroCopy, /*on_done_with_host_buffer=*/ [cpu_tensor = *cpu_tensor]() { /* frees tensor */ }, pjrt_device, device_layout); @@ -78,7 +78,7 @@ absl::StatusOr> HostTensorToPjRtBuffer( pjrt_client->BufferFromHostBuffer( cpu_tensor->data(), shape.element_type(), shape.dimensions(), /*byte_strides=*/std::nullopt, - xla::PjRtClient::HostBufferSemantics::kZeroCopy, + xla::PjRtClient::HostBufferSemantics::kImmutableZeroCopy, /*on_done_with_host_buffer=*/ [cpu_tensor = *cpu_tensor]() { /* frees tensor */ }, pjrt_device)); return second_try_buffer; @@ -93,7 +93,7 @@ void PjRtDeviceContext::CopyDeviceTensorToCPU(const Tensor* device_tensor, Device* device, Tensor* cpu_tensor, StatusCallback done) { - profiler::TraceMe traceme("PjRtDeviceContext::CopyDeviceTensorToCPU"); + tsl::profiler::TraceMe traceme("PjRtDeviceContext::CopyDeviceTensorToCPU"); if (device_tensor->NumElements() == 0) { VLOG(2) << "CopyDeviceTensorToCPU empty tensor"; done(absl::OkStatus()); @@ -136,7 +136,7 @@ void PjRtDeviceContext::CopyDeviceTensorToCPU(const Tensor* device_tensor, return; } - 
xla::PjRtFuture future = device_buffer->ToLiteral(literal.get()); + xla::PjRtFuture<> future = device_buffer->ToLiteral(literal.get()); future.OnReady([literal = std::move(literal), done = std::move(done)]( const tensorflow::Status& status) { done(status); }); } @@ -146,14 +146,13 @@ void PjRtDeviceContext::CopyCPUTensorToDevice(const Tensor* cpu_tensor, Tensor* device_tensor, StatusCallback done, bool sync_dst_compute) const { - profiler::TraceMe traceme("PjRtDeviceContext::CopyCPUTensorToDevice"); + tsl::profiler::TraceMe traceme("PjRtDeviceContext::CopyCPUTensorToDevice"); if (cpu_tensor->NumElements() == 0) { VLOG(2) << "CopyCPUTensorToDevice empty tensor"; done(absl::OkStatus()); return; } - // TODO(b/252887149): figure out how to cache PJRT client. absl::StatusOr pjrt_client = GetOrCreatePjRtClient(DeviceType(device->device_type())); if (!pjrt_client.ok()) { @@ -187,8 +186,6 @@ void PjRtDeviceContext::CopyCPUTensorToDevice(const Tensor* cpu_tensor, CHECK(!result_tensor->GetBuffer()); // Crash OK result_tensor->SetBuffer(std::move(*buffer_or)); } - // TODO(b/244666476): evaluate the performance impact of marking ready when - // the data in device buffer is computed. pjrt_buffer->GetReadyFuture().OnReady(std::move(done)); } @@ -243,7 +240,7 @@ void PjRtDeviceToDeviceCopy(DeviceContext* send_dev_context, AllocatorAttributes dst_alloc_attr, const Tensor* input, Tensor* output, int dev_to_dev_stream_index, StatusCallback done) { - profiler::TraceMe traceme("PjRtDevice_DeviceToDeviceCopy"); + tsl::profiler::TraceMe traceme("PjRtDevice_DeviceToDeviceCopy"); if (input->NumElements() == 0) { VLOG(2) << "PjRtDevice_DeviceToDeviceCopy empty tensor"; done(absl::OkStatus()); @@ -298,8 +295,6 @@ void PjRtDeviceToDeviceCopy(DeviceContext* send_dev_context, CHECK(!output_tensor->GetBuffer()); // Crash OK output_tensor->SetBuffer(std::move(*buffer_or)); } - // TODO(b/244666476): evaluate the performance impact of marking ready when - // the data in device buffer is computed. 
pjrt_buffer->GetReadyFuture().OnReady(std::move(done)); } diff --git a/tensorflow/compiler/jit/xla_device.cc b/tensorflow/compiler/jit/xla_device.cc index 2d0d9d51036033..b5b0c16422ccab 100644 --- a/tensorflow/compiler/jit/xla_device.cc +++ b/tensorflow/compiler/jit/xla_device.cc @@ -485,7 +485,8 @@ void XlaDevice::ComputeAsync(AsyncOpKernel* op_kernel, OpKernelContext* context, Status XlaDevice::Sync() { VLOG(1) << "XlaDevice::Sync"; - profiler::TraceMe activity("XlaDevice::Sync", profiler::TraceMeLevel::kInfo); + tsl::profiler::TraceMe activity("XlaDevice::Sync", + tsl::profiler::TraceMeLevel::kInfo); std::shared_ptr stream; { mutex_lock lock(mu_); diff --git a/tensorflow/compiler/jit/xla_device_context.cc b/tensorflow/compiler/jit/xla_device_context.cc index 4eba7373910f97..821d294af90f66 100644 --- a/tensorflow/compiler/jit/xla_device_context.cc +++ b/tensorflow/compiler/jit/xla_device_context.cc @@ -261,7 +261,7 @@ void XlaDeviceContext::CopyDeviceTensorToCPU(const Tensor* device_tensor, transfer_manager_->TransferLiteralFromDevice( device_to_host_stream.get(), xla_tensor->shaped_buffer(), literal, [this, ref, xla_tensor, done, device_to_host_stream, - device_allows_sync_on_completion](xla::Status status) { + device_allows_sync_on_completion](absl::Status status) { Status done_status = status; VLOG(2) << "Transfer from device as literal: " << xla_tensor->shaped_buffer().ToString(); diff --git a/tensorflow/compiler/jit/xla_launch_util.cc b/tensorflow/compiler/jit/xla_launch_util.cc index f9657509623cc1..9107e07b83bc21 100644 --- a/tensorflow/compiler/jit/xla_launch_util.cc +++ b/tensorflow/compiler/jit/xla_launch_util.cc @@ -920,7 +920,7 @@ absl::StatusOr>> RunPjRtExecutable( &executable_args, &owned_executable_args, &non_donatable_input_indices)); std::vector> execute_outputs; - std::optional> future; + std::optional> future; if (executable->num_replicas() != 1 || executable->num_partitions() != 1) { TF_ASSIGN_OR_RETURN( execute_outputs, diff --git 
a/tensorflow/compiler/mlir/lite/debug/debug_test.cc b/tensorflow/compiler/mlir/lite/debug/debug_test.cc index 127d485b842f94..d876e05b31fdf6 100644 --- a/tensorflow/compiler/mlir/lite/debug/debug_test.cc +++ b/tensorflow/compiler/mlir/lite/debug/debug_test.cc @@ -120,7 +120,7 @@ class InitPassManagerTest : public testing::Test { builder.create(builder.getUnknownLoc()); } - tsl::Status GetDumpDir(std::string* dump_dir) { + absl::Status GetDumpDir(std::string* dump_dir) { std::vector files; if (auto status = tsl::Env::Default()->GetChildren(path_, &files); !status.ok()) { @@ -131,7 +131,7 @@ class InitPassManagerTest : public testing::Test { "Expecting directory to have one child."); } *dump_dir = tsl::io::JoinPath(path_, files[0]); - return tsl::OkStatus(); + return absl::OkStatus(); } std::string path_; diff --git a/tensorflow/compiler/mlir/lite/flatbuffer_export.cc b/tensorflow/compiler/mlir/lite/flatbuffer_export.cc index b98d3220ee15a8..4655fa7c069b54 100644 --- a/tensorflow/compiler/mlir/lite/flatbuffer_export.cc +++ b/tensorflow/compiler/mlir/lite/flatbuffer_export.cc @@ -120,6 +120,7 @@ limitations under the License. #include "tsl/platform/status.h" #include "tsl/platform/tstring.h" +using absl::StatusOr; using llvm::dyn_cast; using llvm::formatv; using llvm::isa; @@ -143,7 +144,6 @@ using tensorflow::OpOrArgLocNameMapper; using tensorflow::OpOrArgNameMapper; using tensorflow::Status; using tflite::flex::IsAllowlistedFlexOp; -using xla::StatusOr; template using BufferOffset = flatbuffers::Offset; diff --git a/tensorflow/compiler/mlir/lite/flatbuffer_import.cc b/tensorflow/compiler/mlir/lite/flatbuffer_import.cc index 90a6e7704d70af..0d477b51b6d467 100644 --- a/tensorflow/compiler/mlir/lite/flatbuffer_import.cc +++ b/tensorflow/compiler/mlir/lite/flatbuffer_import.cc @@ -101,6 +101,8 @@ limitations under the License. 
#include "tsl/platform/status.h" #include "tsl/platform/statusor.h" +using absl::Status; +using absl::StatusOr; using llvm::ArrayRef; using mlir::Builder; using mlir::DenseElementsAttr; @@ -115,8 +117,6 @@ using mlir::Value; using mlir::func::FuncOp; using tflite::OperatorT; using tflite::TensorT; -using xla::Status; -using xla::StatusOr; namespace errors = tensorflow::errors; namespace tfl = mlir::TFL; @@ -519,7 +519,7 @@ Status ConvertSubgraphIdxToStablehloRegion( op_state.addAttribute("body", body_attr); - return ::tensorflow::OkStatus(); + return absl::OkStatus(); } if (auto* opts = op.builtin_options_2.AsStablehloReduceWindowOptions()) { int32_t body_idx = opts->body_subgraph_index; @@ -532,7 +532,7 @@ Status ConvertSubgraphIdxToStablehloRegion( op_state.addAttribute("body", body_attr); - return ::tensorflow::OkStatus(); + return absl::OkStatus(); } if (auto* opts = op.builtin_options_2.AsStablehloSortOptions()) { int32_t comparator_idx = opts->comparator_subgraph_index; @@ -545,7 +545,7 @@ Status ConvertSubgraphIdxToStablehloRegion( op_state.addAttribute("comparator", comparator_attr); - return ::tensorflow::OkStatus(); + return absl::OkStatus(); } if (auto* opts = op.builtin_options_2.AsStablehloWhileOptions()) { int32_t body_idx = opts->body_subgraph_index; @@ -566,7 +566,7 @@ Status ConvertSubgraphIdxToStablehloRegion( op_state.addAttribute("body", body_attr); op_state.addAttribute("cond", cond_attr); - return ::tensorflow::OkStatus(); + return absl::OkStatus(); } if (auto* opts = op.builtin_options_2.AsStablehloScatterOptions()) { uint32_t subgraph_idx = opts->update_computation_subgraph_index; @@ -580,10 +580,10 @@ Status ConvertSubgraphIdxToStablehloRegion( op_state.addAttribute(kScatterRegionFuncName, subgraph_attr); - return ::tensorflow::OkStatus(); + return absl::OkStatus(); } // skip if not supported - return ::tensorflow::OkStatus(); + return absl::OkStatus(); } Status AddOpIntermediatesForLstm( @@ -612,7 +612,7 @@ Status 
AddOpIntermediatesForLstm( op_state.addAttribute(named_attr.getName(), named_attr.getValue()); } } - return ::tensorflow::OkStatus(); + return absl::OkStatus(); } // TODO(krzysd) Handle function calls @@ -747,7 +747,7 @@ StatusOr ConvertOp( llvm::SmallVector attrs; auto builtin_code = tflite::GetBuiltinCode(&op_code); if (builtin_code == tflite::BuiltinOperator_CUSTOM) { - auto status = ::tensorflow::OkStatus(); + auto status = absl::OkStatus(); std::vector custom_options; diff --git a/tensorflow/compiler/mlir/lite/flatbuffer_operator.cc b/tensorflow/compiler/mlir/lite/flatbuffer_operator.cc index 3166b589418658..f72ef1f9641d48 100644 --- a/tensorflow/compiler/mlir/lite/flatbuffer_operator.cc +++ b/tensorflow/compiler/mlir/lite/flatbuffer_operator.cc @@ -58,9 +58,9 @@ limitations under the License. namespace { +using ::absl::StatusOr; using ::tensorflow::Status; using ::tensorflow::errors::InvalidArgument; -using ::xla::StatusOr; StatusOr GetPaddingAttr(TfLitePadding pad_params, mlir::Builder builder, @@ -448,7 +448,7 @@ Status mlir::CustomOptionsToAttributes( "custom_option", mlir::TFL::ConstBytesAttr::get(builder.getContext(), content))); - return ::tensorflow::OkStatus(); + return absl::OkStatus(); } // TODO(zichuanwei@): Populate Builtin_options_2 manual for now, should diff --git a/tensorflow/compiler/mlir/lite/ir/tfl_ops.cc b/tensorflow/compiler/mlir/lite/ir/tfl_ops.cc index 30a19e04fb368a..8ac81939d0d4de 100644 --- a/tensorflow/compiler/mlir/lite/ir/tfl_ops.cc +++ b/tensorflow/compiler/mlir/lite/ir/tfl_ops.cc @@ -578,6 +578,14 @@ inline bool IsBF16ShapedType(Type t) { return false; } +// Returns true if it is a shaped type of FloatType elements. +inline bool IsFloatShapedType(Type t) { + if (auto shaped_type = t.dyn_cast_or_null()) { + return shaped_type.getElementType().isa(); + } + return false; +} + // Returns new shape with rank 'new_dims' with padded ones on the // left if needed. 
inline std::vector GetPaddedShape(ArrayRef old_shape, @@ -3069,6 +3077,50 @@ OpFoldResult SquareOp::fold(FoldAdaptor adaptor) { return ConstFoldUnaryOp(result_type, operands[0], compute); } +//===----------------------------------------------------------------------===// +// MaximumOp +//===----------------------------------------------------------------------===// + +OpFoldResult MaximumOp::fold(FoldAdaptor adaptor) { + auto lhs_type = getLhs().getType(); + auto rhs_type = getRhs().getType(); + // Only constant fold for float tensors of the same type is implemented. + if (lhs_type != rhs_type || !IsFloatShapedType(lhs_type)) return nullptr; + + auto lhs = adaptor.getLhs().dyn_cast_or_null(); + auto rhs = adaptor.getRhs().dyn_cast_or_null(); + if (lhs && lhs.isSplat()) { + APFloat lhs_value = lhs.getSplatValue(); + lhs_value.changeSign(); + if (lhs_value.isLargest()) return getRhs(); + } + if (rhs && rhs.isSplat()) { + APFloat rhs_value = rhs.getSplatValue(); + rhs_value.changeSign(); + if (rhs_value.isLargest()) return getLhs(); + } + return nullptr; +} + +//===----------------------------------------------------------------------===// +// MinimumOp +//===----------------------------------------------------------------------===// + +OpFoldResult MinimumOp::fold(FoldAdaptor adaptor) { + auto lhs_type = getLhs().getType(); + auto rhs_type = getRhs().getType(); + // Only constant fold for float tensors of the same type is implemented. 
+ if (lhs_type != rhs_type || !IsFloatShapedType(lhs_type)) return nullptr; + + auto lhs = adaptor.getLhs().dyn_cast_or_null(); + auto rhs = adaptor.getRhs().dyn_cast_or_null(); + if (lhs && lhs.isSplat() && lhs.getSplatValue().isLargest()) + return getRhs(); + if (rhs && rhs.isSplat() && rhs.getSplatValue().isLargest()) + return getLhs(); + return nullptr; +} + //===----------------------------------------------------------------------===// // RankOp //===----------------------------------------------------------------------===// diff --git a/tensorflow/compiler/mlir/lite/ir/tfl_ops.td b/tensorflow/compiler/mlir/lite/ir/tfl_ops.td index 481f5573058b8c..5f4cce6d8e8a76 100644 --- a/tensorflow/compiler/mlir/lite/ir/tfl_ops.td +++ b/tensorflow/compiler/mlir/lite/ir/tfl_ops.td @@ -2269,6 +2269,8 @@ def TFL_MaximumOp : TFL_Op<"maximum", [ TFL_TensorOf<[F32, TFL_Int32Or64, QI8, QUI8, QI16]>:$max ); + let hasFolder = 1; + let builders = [TFL_BroadcastableBinaryBuilder]; let hasOptions = 0; @@ -2528,6 +2530,8 @@ def TFL_MinimumOp : TFL_Op<"minimum", [ TFL_TensorOf<[F32, TFL_Int32Or64, QI8, QUI8, QI16]>:$min ); + let hasFolder = 1; + let builders = [TFL_BroadcastableBinaryBuilder]; let hasOptions = 0; diff --git a/tensorflow/compiler/mlir/lite/python/saved_model_to_tfl_flatbuffer.cc b/tensorflow/compiler/mlir/lite/python/saved_model_to_tfl_flatbuffer.cc index 3e50192fa0640d..16e12bbb6da04d 100644 --- a/tensorflow/compiler/mlir/lite/python/saved_model_to_tfl_flatbuffer.cc +++ b/tensorflow/compiler/mlir/lite/python/saved_model_to_tfl_flatbuffer.cc @@ -124,7 +124,7 @@ Status HandleInputOutputArraysWithModule( ") does not exist in the given graph"); } } - return OkStatus(); + return absl::OkStatus(); } Status ConvertSavedModelToTFLiteFlatBuffer( diff --git a/tensorflow/compiler/mlir/lite/quantization/BUILD b/tensorflow/compiler/mlir/lite/quantization/BUILD index c6928f60f1ccaa..f6d8de698481e3 100644 --- a/tensorflow/compiler/mlir/lite/quantization/BUILD +++ 
b/tensorflow/compiler/mlir/lite/quantization/BUILD @@ -112,7 +112,6 @@ cc_library( ":device_target", "//tensorflow/compiler/mlir/lite/quantization/ir:QuantOps", "//tensorflow/compiler/mlir/quantization/common/quantization_lib", - "@com_google_absl//absl/memory", "@llvm-project//llvm:Support", "@llvm-project//mlir:FuncDialect", "@llvm-project//mlir:IR", diff --git a/tensorflow/compiler/mlir/lite/quantization/numerical_utils_test.cc b/tensorflow/compiler/mlir/lite/quantization/numerical_utils_test.cc index e0035e5c3c5175..7f9b02b9f61473 100644 --- a/tensorflow/compiler/mlir/lite/quantization/numerical_utils_test.cc +++ b/tensorflow/compiler/mlir/lite/quantization/numerical_utils_test.cc @@ -18,7 +18,6 @@ limitations under the License. #include #include -#include #include #include "absl/types/optional.h" diff --git a/tensorflow/compiler/mlir/lite/quantization/quantization_context.cc b/tensorflow/compiler/mlir/lite/quantization/quantization_context.cc index d7bcc43b208675..ecba20595c0f91 100644 --- a/tensorflow/compiler/mlir/lite/quantization/quantization_context.cc +++ b/tensorflow/compiler/mlir/lite/quantization/quantization_context.cc @@ -18,9 +18,7 @@ limitations under the License. #include #include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/Support/Casting.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" @@ -28,10 +26,9 @@ limitations under the License. 
#include "mlir/Dialect/Quant/QuantTypes.h" // from @llvm-project #include "mlir/IR/Attributes.h" // from @llvm-project #include "mlir/IR/Builders.h" // from @llvm-project -#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/BuiltinTypeInterfaces.h" // from @llvm-project #include "mlir/IR/BuiltinTypes.h" // from @llvm-project #include "mlir/IR/MLIRContext.h" // from @llvm-project -#include "mlir/IR/Matchers.h" // from @llvm-project #include "mlir/IR/Operation.h" // from @llvm-project #include "mlir/IR/Value.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project diff --git a/tensorflow/compiler/mlir/lite/quantization/quantization_context.h b/tensorflow/compiler/mlir/lite/quantization/quantization_context.h index c67573d0e9d3f8..8a6bf4f83a28c5 100644 --- a/tensorflow/compiler/mlir/lite/quantization/quantization_context.h +++ b/tensorflow/compiler/mlir/lite/quantization/quantization_context.h @@ -19,6 +19,7 @@ limitations under the License. #include #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallVector.h" #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project #include "mlir/Dialect/Quant/QuantTypes.h" // from @llvm-project #include "mlir/IR/BuiltinOps.h" // from @llvm-project diff --git a/tensorflow/compiler/mlir/lite/quantization/stablehlo/BUILD b/tensorflow/compiler/mlir/lite/quantization/stablehlo/BUILD index f96d4961e733b4..43142a7a7c52dd 100644 --- a/tensorflow/compiler/mlir/lite/quantization/stablehlo/BUILD +++ b/tensorflow/compiler/mlir/lite/quantization/stablehlo/BUILD @@ -20,6 +20,7 @@ cc_library( "//tensorflow/compiler/mlir/quantization/stablehlo:quantization_config_proto_cc", "//tensorflow/compiler/mlir/quantization/stablehlo/cc:config", "//tensorflow/compiler/mlir/quantization/stablehlo/cc:static_range_ptq", + "//tensorflow/compiler/mlir/quantization/stablehlo/cc:weight_only_ptq", "//tensorflow/compiler/mlir/quantization/tensorflow/python:py_function_lib", 
"//tensorflow/compiler/mlir/tensorflow/transforms:tensorflow_passes", "//tensorflow/compiler/mlir/tensorflow/transforms:tf_saved_model_freeze_variables", diff --git a/tensorflow/compiler/mlir/lite/quantization/stablehlo/quantization.cc b/tensorflow/compiler/mlir/lite/quantization/stablehlo/quantization.cc index 08f5ecd4851b7e..9561b9003add3b 100644 --- a/tensorflow/compiler/mlir/lite/quantization/stablehlo/quantization.cc +++ b/tensorflow/compiler/mlir/lite/quantization/stablehlo/quantization.cc @@ -32,6 +32,7 @@ limitations under the License. #include "tensorflow/compiler/mlir/lite/stablehlo/transforms/tf_stablehlo_pass.h" #include "tensorflow/compiler/mlir/quantization/stablehlo/cc/config.h" #include "tensorflow/compiler/mlir/quantization/stablehlo/cc/static_range_ptq.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/weight_only_ptq.h" #include "tensorflow/compiler/mlir/quantization/stablehlo/passes/passes.h" #include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/python/py_function_lib.h" @@ -43,6 +44,8 @@ namespace tensorflow { namespace { using ::mlir::quant::stablehlo::StaticRangePtqComponent; +using ::mlir::quant::stablehlo::WeightOnlyPtqComponent; +using ::stablehlo::quantization::Method; using ::stablehlo::quantization::PopulateDefaults; using ::stablehlo::quantization::QuantizationConfig; using ::tensorflow::SignatureDef; @@ -122,22 +125,38 @@ absl::StatusOr RunQuantization( // after variable freezing. 
mlir::PassManager pm(module_op.getContext()); pm.addPass(mlir::TF::CreateTFShapeInferencePass()); - mlir::odml::AddLegalizeTFToStablehloPasses( - pm, /*skip_quantization_ops=*/true, - /*skip_resize=*/false, /*skip_stateful_partitioned_call=*/false); + mlir::odml::AddLegalizeTFToStablehloPasses(pm, /*skip_quantization_ops=*/true, + /*skip_resize=*/false, + /*skip_partitioned_calls=*/false); pm.addNestedPass( mlir::quant::stablehlo::createRemoveShardingCustomCallPass()); if (failed(pm.run(module_op))) { return absl::InternalError("Failed to run legalize TF to StableHLO."); } - StaticRangePtqComponent static_range_ptq_component( - module_op.getContext(), quantization_py_function_lib, saved_model_dir, - /*signature_keys=*/exported_names, saved_model_tags, signature_def_map, - GetFunctionAliases(*saved_model_bundle)); + absl::StatusOr quantized_module_op; + // Currently, only StaticRangePtq or WeightOnlyPtq is supported. + // Consider merging the pipelines to address mixed algorithm models. + if (HasQuantizationMethod(updated_config.specs(), + Method::MethodCase::kStaticRangePtq)) { + StaticRangePtqComponent static_range_ptq_component( + module_op.getContext(), quantization_py_function_lib, saved_model_dir, + /*signature_keys=*/exported_names, saved_model_tags, signature_def_map, + GetFunctionAliases(*saved_model_bundle)); + + quantized_module_op = + static_range_ptq_component.Run(module_op, updated_config); + } else if (HasQuantizationMethod(updated_config.specs(), + Method::MethodCase::kWeightOnlyPtq)) { + WeightOnlyPtqComponent weight_only_ptq_component(module_op.getContext()); + quantized_module_op = + weight_only_ptq_component.Run(module_op, updated_config); + } else { + return absl::InvalidArgumentError( + "Quantization config must have either static_range_ptq_preset or " + "weight_only_ptq_preset."); + } - absl::StatusOr quantized_module_op = - static_range_ptq_component.Run(module_op, updated_config); if (!quantized_module_op.ok()) { return 
absl::InternalError("Failed to run quantization. Status msg: " + quantized_module_op.status().ToString()); diff --git a/tensorflow/compiler/mlir/lite/stablehlo/BUILD b/tensorflow/compiler/mlir/lite/stablehlo/BUILD index bd83f16de105f8..9976d6ff363c8f 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/BUILD +++ b/tensorflow/compiler/mlir/lite/stablehlo/BUILD @@ -1,4 +1,4 @@ -load("@llvm-project//mlir:tblgen.bzl", "gentbl_cc_library") +load("@llvm-project//mlir:tblgen.bzl", "gentbl_cc_library", "td_library") load("//tensorflow:tensorflow.bzl", "tf_cc_binary") load("//tensorflow:tensorflow.default.bzl", "get_compatible_with_portable") load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") @@ -693,12 +693,41 @@ cc_library( ], ) +td_library( + name = "composite_td_files", + srcs = [ + "transforms/composite_avg_pool_patterns.td", + "transforms/composite_utils.td", + ], + compatible_with = get_compatible_with_portable(), + deps = [ + "//tensorflow/compiler/mlir/lite:tensorflow_lite_ops_td_files", + "@llvm-project//mlir:FuncTdFiles", + "@local_xla//xla/mlir_hlo:hlo_ops_td_files", + ], +) + +cc_library( + name = "composite_utils", + srcs = ["transforms/composite_utils.cc"], + hdrs = ["transforms/composite_utils.h"], + deps = [ + "//tensorflow/compiler/mlir/lite:tensorflow_lite", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Support", + "@llvm-project//mlir:TransformUtils", + "@local_xla//xla/mlir_hlo", + ], +) + cc_library( name = "composite_lowering", srcs = [ + "transforms/composite_avg_pool.cc", "transforms/composite_lowering_pass.cc", ], hdrs = [ + "transforms/composite_avg_pool.h", "transforms/passes.h", ], copts = [ @@ -706,8 +735,13 @@ cc_library( ], deps = [ ":composite_lowering_inc_gen", + ":composite_utils", ":passes_inc_gen", "//tensorflow/compiler/mlir/lite:tensorflow_lite", + "//tensorflow/core:framework", + "@com_google_absl//absl/status", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:ArithDialect", "@llvm-project//mlir:FuncDialect", 
"@llvm-project//mlir:IR", "@llvm-project//mlir:Pass", @@ -730,6 +764,8 @@ gentbl_cc_library( tblgen = "@llvm-project//mlir:mlir-tblgen", td_file = "transforms/composite_lowering_patterns.td", deps = [ + ":composite_td_files", + ":composite_utils", "//tensorflow/compiler/mlir/lite:tensorflow_lite_ops_td_files", "@llvm-project//llvm:Support", "@llvm-project//mlir:FuncTdFiles", diff --git a/tensorflow/compiler/mlir/lite/stablehlo/odml_converter/BUILD b/tensorflow/compiler/mlir/lite/stablehlo/odml_converter/BUILD new file mode 100644 index 00000000000000..c487600517f9b8 --- /dev/null +++ b/tensorflow/compiler/mlir/lite/stablehlo/odml_converter/BUILD @@ -0,0 +1,64 @@ +load("@llvm-project//mlir:tblgen.bzl", "gentbl_cc_library") +load("//tensorflow:tensorflow.bzl", "tf_cc_binary") + +package( + # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"], + default_visibility = [ + ":friends", + ], +) + +package_group( + name = "friends", + packages = [ + "//tensorflow/compiler/mlir/lite/...", + ], +) + +tf_cc_binary( + name = "odml-converter", + srcs = ["odml_converter_main.cc"], + visibility = [ + "//tensorflow/compiler/mlir/lite/stablehlo/odml_converter:__subpackages__", + "//third_party/odml/infra:__subpackages__", + ], # Prototype phase. 
+ deps = [ + ":all_passes", + "//tensorflow/compiler/mlir:init_mlir", + "//tensorflow/compiler/mlir/lite:tensorflow_lite", + "//tensorflow/compiler/mlir/lite/stablehlo:legalize_stablehlo_to_vhlo_pass", + "//tensorflow/compiler/mlir/tensorflow:tensorflow_types", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:ArithDialect", + "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:MlirOptLib", + "@llvm-project//mlir:Support", + "@stablehlo//:stablehlo_ops", + ], +) + +gentbl_cc_library( + name = "passes_inc_gen", + tbl_outs = [ + ( + [ + "-gen-pass-decls", + "-name=ODMLConverter", + ], + "passes.h.inc", + ), + ], + tblgen = "@llvm-project//mlir:mlir-tblgen", + td_file = "passes.td", + deps = ["@llvm-project//mlir:PassBaseTdFiles"], +) + +cc_library( + name = "all_passes", + hdrs = ["passes.h"], + deps = [":passes_inc_gen"], +) + +exports_files([ + "run_lit.sh", +]) diff --git a/tensorflow/compiler/mlir/lite/stablehlo/odml_converter/odml_converter_main.cc b/tensorflow/compiler/mlir/lite/stablehlo/odml_converter/odml_converter_main.cc new file mode 100644 index 00000000000000..ecd7396c2a4622 --- /dev/null +++ b/tensorflow/compiler/mlir/lite/stablehlo/odml_converter/odml_converter_main.cc @@ -0,0 +1,49 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "llvm/Support/raw_ostream.h" +#include "mlir/Dialect/Arith/IR/Arith.h" // from @llvm-project +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "mlir/Tools/mlir-opt/MlirOptMain.h" // from @llvm-project +#include "stablehlo/dialect/StablehloOps.h" // from @stablehlo +#include "tensorflow/compiler/mlir/init_mlir.h" +#include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" +#include "tensorflow/compiler/mlir/lite/stablehlo/odml_converter/passes.h" +#include "tensorflow/compiler/mlir/lite/stablehlo/transforms/passes.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_dialect.h" + +const char* art = R"( + ___ ___ __ __ _ ___ _ + / _ \| \| \/ | | / __|___ _ ___ _____ _ _| |_ ___ _ _ +| (_) | |) | |\/| | |__ | (__/ _ \ ' \ V / -_) '_| _/ -_) '_| + \___/|___/|_| |_|____| \___\___/_||_\_/\___|_| \__\___|_| +)"; + +int main(int argc, char* argv[]) { + tensorflow::InitMlir y(&argc, &argv); + llvm::errs() << art << "\n"; + + mlir::odml::registerODMLConverterPasses(); + mlir::odml::registerLegalizeStablehloToVhloPass(); + + mlir::DialectRegistry registry; + registry.insert(); + + return failed( + mlir::MlirOptMain(argc, argv, "ODML Converter Driver\n", registry)); +} diff --git a/tensorflow/compiler/mlir/lite/stablehlo/odml_converter/passes.h b/tensorflow/compiler/mlir/lite/stablehlo/odml_converter/passes.h new file mode 100644 index 00000000000000..b3589356f196a2 --- /dev/null +++ b/tensorflow/compiler/mlir/lite/stablehlo/odml_converter/passes.h @@ -0,0 +1,26 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_ODML_CONVERTER_PASSES_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_ODML_CONVERTER_PASSES_H_ + +namespace mlir::odml { + +#define GEN_PASS_REGISTRATION +#include "tensorflow/compiler/mlir/lite/stablehlo/odml_converter/passes.h.inc" + +} // namespace mlir::odml + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_ODML_CONVERTER_PASSES_H_ diff --git a/tensorflow/compiler/mlir/lite/stablehlo/odml_converter/passes.td b/tensorflow/compiler/mlir/lite/stablehlo/odml_converter/passes.td new file mode 100644 index 00000000000000..800d7e0d2ff59b --- /dev/null +++ b/tensorflow/compiler/mlir/lite/stablehlo/odml_converter/passes.td @@ -0,0 +1,17 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +include "mlir/Pass/PassBase.td" + diff --git a/tensorflow/compiler/mlir/lite/stablehlo/odml_converter/tests/BUILD b/tensorflow/compiler/mlir/lite/stablehlo/odml_converter/tests/BUILD new file mode 100644 index 00000000000000..c990b20c8fb51c --- /dev/null +++ b/tensorflow/compiler/mlir/lite/stablehlo/odml_converter/tests/BUILD @@ -0,0 +1,25 @@ +load("//tensorflow:tensorflow.default.bzl", "filegroup") +load("//tensorflow/compiler/mlir:glob_lit_test.bzl", "glob_lit_tests") + +package( + # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"], + default_visibility = ["//tensorflow/compiler/mlir/lite/stablehlo/odml_converter:__subpackages__"], +) + +glob_lit_tests( + name = "filecheck_tests", + data = [":test_utilities"], + driver = "@llvm-project//mlir:run_lit.sh", + test_file_exts = [ + "mlir", + ], +) + +filegroup( + name = "test_utilities", + testonly = True, + data = [ + "//tensorflow/compiler/mlir/lite/stablehlo/odml_converter:odml-converter", + "@llvm-project//llvm:FileCheck", + ], +) diff --git a/tensorflow/compiler/mlir/lite/stablehlo/tests/composite-lowering.mlir b/tensorflow/compiler/mlir/lite/stablehlo/tests/composite-lowering.mlir index 5924d0dce396c4..c614ee10bf2b45 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/tests/composite-lowering.mlir +++ b/tensorflow/compiler/mlir/lite/stablehlo/tests/composite-lowering.mlir @@ -8,15 +8,15 @@ func.func @hardswish(%arg0: tensor<2xf32>) -> (tensor<*xf32>) { } func.func private @XlaCallModule_aten.hardswish.default.impl_0(%arg0: tensor<2xf32>) -> tensor<2xf32> { %0 = mhlo.constant dense<6.000000e+00> : tensor - %1 = "mhlo.broadcast_in_dim"(%0) {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor) -> tensor<2xf32> + %1 = "mhlo.broadcast_in_dim"(%0) <{broadcast_dimensions = dense<> : tensor<0xi64>}> : (tensor) -> tensor<2xf32> %2 = mhlo.constant dense<3.40282347E+38> : tensor - %3 = 
"mhlo.broadcast_in_dim"(%2) {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor) -> tensor<2xf32> + %3 = "mhlo.broadcast_in_dim"(%2) <{broadcast_dimensions = dense<> : tensor<0xi64>}> : (tensor) -> tensor<2xf32> %4 = mhlo.constant dense<3.000000e+00> : tensor - %5 = "mhlo.broadcast_in_dim"(%4) {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor) -> tensor<2xf32> + %5 = "mhlo.broadcast_in_dim"(%4) <{broadcast_dimensions = dense<> : tensor<0xi64>}> : (tensor) -> tensor<2xf32> %6 = mhlo.constant dense<0.000000e+00> : tensor - %7 = "mhlo.broadcast_in_dim"(%6) {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor) -> tensor<2xf32> + %7 = "mhlo.broadcast_in_dim"(%6) <{broadcast_dimensions = dense<> : tensor<0xi64>}> : (tensor) -> tensor<2xf32> %8 = mhlo.constant dense<-3.40282347E+38> : tensor - %9 = "mhlo.broadcast_in_dim"(%8) {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor) -> tensor<2xf32> + %9 = "mhlo.broadcast_in_dim"(%8) <{broadcast_dimensions = dense<> : tensor<0xi64>}> : (tensor) -> tensor<2xf32> %10 = mhlo.add %arg0, %5 : tensor<2xf32> %11 = mhlo.clamp %7, %10, %3 : tensor<2xf32> %12 = mhlo.clamp %9, %11, %1 : tensor<2xf32> @@ -31,4 +31,149 @@ func.func private @XlaCallModule_aten.hardswish.default.impl_0(%arg0: tensor<2xf // CHECK: %[[VAL_2:.*]] = "tf.Identity"(%[[VAL_1]]) {device = ""} : (tensor<2xf32>) -> tensor<*xf32> // CHECK: %[[VAL_3:.*]] = "tf.Identity"(%[[VAL_2]]) {device = ""} : (tensor<*xf32>) -> tensor<*xf32> // CHECK: return %[[VAL_3]] : tensor<*xf32> -// CHECK: } \ No newline at end of file +// CHECK: } + + +func.func @avg_pool2d_1(%arg0: tensor<1x3x6x6xf32>) -> (tensor<*xf32>) { + %0 = mhlo.composite "aten.avg_pool2d.default" %arg0 {composite_attributes = {ceil_mode = false, count_include_pad = true, divisor_override = "py_None", kernel_size = dense<3> : tensor<2xi64>, padding = dense<0> : tensor<2xi64>, stride = dense<1> : tensor<2xi64>}, decomposition = @XlaCallModule_aten.avg_pool2d.default.impl_0} : 
(tensor<1x3x6x6xf32>) -> tensor<1x3x4x4xf32> + %1 = "tf.Identity"(%0) {device = ""} : (tensor<1x3x4x4xf32>) -> tensor<*xf32> + %2 = "tf.Identity"(%1) {device = ""} : (tensor<*xf32>) -> tensor<*xf32> + return %2 : tensor<*xf32> +} +func.func private @XlaCallModule_aten.avg_pool2d.default.impl_0(%arg0: tensor<1x3x6x6xf32>) -> tensor<1x3x4x4xf32> { + %0 = mhlo.constant dense<1.000000e+00> : tensor + %1 = "mhlo.broadcast_in_dim"(%0) {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor) -> tensor<6x6xf32> + %2 = mhlo.constant dense<0.000000e+00> : tensor + %3 = "mhlo.reduce_window"(%arg0, %2) ({ + ^bb0(%arg1: tensor, %arg2: tensor): + %7 = mhlo.add %arg1, %arg2 : tensor + mhlo.return %7 : tensor + }) {window_dimensions = dense<[1, 1, 3, 3]> : tensor<4xi64>} : (tensor<1x3x6x6xf32>, tensor) -> tensor<1x3x4x4xf32> + %4 = "mhlo.reduce_window"(%1, %2) ({ + ^bb0(%arg1: tensor, %arg2: tensor): + %7 = mhlo.add %arg1, %arg2 : tensor + mhlo.return %7 : tensor + }) {window_dimensions = dense<3> : tensor<2xi64>} : (tensor<6x6xf32>, tensor) -> tensor<4x4xf32> + %5 = "mhlo.broadcast_in_dim"(%4) {broadcast_dimensions = dense<[2, 3]> : tensor<2xi64>} : (tensor<4x4xf32>) -> tensor<1x3x4x4xf32> + %6 = mhlo.divide %3, %5 : tensor<1x3x4x4xf32> + return %6 : tensor<1x3x4x4xf32> +} + +// CHECK-LABEL: func.func @avg_pool2d_1( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<1x3x6x6xf32>) -> tensor<*xf32> { +// CHECK: %[[VAL_1:.*]] = arith.constant dense<[0, 2, 3, 1]> : tensor<4xi32> +// CHECK: %[[VAL_2:.*]] = "tfl.transpose"(%[[VAL_0]], %[[VAL_1]]) : (tensor<1x3x6x6xf32>, tensor<4xi32>) -> tensor<1x6x6x3xf32> +// CHECK: %[[VAL_3:.*]] = arith.constant dense<0> : tensor<4x2xi32> +// CHECK: %[[VAL_4:.*]] = "tfl.pad"(%[[VAL_2]], %[[VAL_3]]) : (tensor<1x6x6x3xf32>, tensor<4x2xi32>) -> tensor<1x6x6x3xf32> +// CHECK: %[[VAL_5:.*]] = "tfl.average_pool_2d"(%[[VAL_4]]) {filter_height = 3 : i32, filter_width = 3 : i32, fused_activation_function = "NONE", padding = "VALID", stride_h = 1 : i32, stride_w = 1 : 
i32} : (tensor<1x6x6x3xf32>) -> tensor<1x4x4x3xf32> +// CHECK: %[[VAL_6:.*]] = arith.constant dense<[0, 3, 1, 2]> : tensor<4xi32> +// CHECK: %[[VAL_7:.*]] = "tfl.transpose"(%[[VAL_5]], %[[VAL_6]]) : (tensor<1x4x4x3xf32>, tensor<4xi32>) -> tensor<1x3x4x4xf32> +// CHECK: %[[VAL_8:.*]] = "tf.Identity"(%[[VAL_7]]) {device = ""} : (tensor<1x3x4x4xf32>) -> tensor<*xf32> +// CHECK: %[[VAL_9:.*]] = "tf.Identity"(%[[VAL_8]]) {device = ""} : (tensor<*xf32>) -> tensor<*xf32> +// CHECK: return %[[VAL_9]] : tensor<*xf32> +// CHECK: } + +func.func @avg_pool2d_2(%arg0: tensor<1x3x6x6xf32>) -> (tensor<*xf32>) { + %0 = mhlo.composite "aten.avg_pool2d.default" %arg0 {composite_attributes = {ceil_mode = false, count_include_pad = false, divisor_override = "py_None", kernel_size = dense<3> : tensor<2xi64>, padding = dense<1> : tensor<2xi64>, stride = dense<1> : tensor<2xi64>}, decomposition = @XlaCallModule_aten.avg_pool2d.default.impl_1} : (tensor<1x3x6x6xf32>) -> tensor<1x3x6x6xf32> + %1 = "tf.Identity"(%0) {device = ""} : (tensor<1x3x6x6xf32>) -> tensor<*xf32> + %2 = "tf.Identity"(%1) {device = ""} : (tensor<*xf32>) -> tensor<*xf32> + return %2 : tensor<*xf32> +} +func.func private @XlaCallModule_aten.avg_pool2d.default.impl_1(%arg0: tensor<1x3x6x6xf32>) -> tensor<1x3x6x6xf32> { + %0 = mhlo.constant dense<[[0.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00], [0.000000e+00, 1.000000e+00, 1.000000e+00, 1.000000e+00, 1.000000e+00, 1.000000e+00, 1.000000e+00, 0.000000e+00], [0.000000e+00, 1.000000e+00, 1.000000e+00, 1.000000e+00, 1.000000e+00, 1.000000e+00, 1.000000e+00, 0.000000e+00], [0.000000e+00, 1.000000e+00, 1.000000e+00, 1.000000e+00, 1.000000e+00, 1.000000e+00, 1.000000e+00, 0.000000e+00], [0.000000e+00, 1.000000e+00, 1.000000e+00, 1.000000e+00, 1.000000e+00, 1.000000e+00, 1.000000e+00, 0.000000e+00], [0.000000e+00, 1.000000e+00, 1.000000e+00, 1.000000e+00, 1.000000e+00, 1.000000e+00, 1.000000e+00, 0.000000e+00], 
[0.000000e+00, 1.000000e+00, 1.000000e+00, 1.000000e+00, 1.000000e+00, 1.000000e+00, 1.000000e+00, 0.000000e+00], [0.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00]]> : tensor<8x8xf32> + %1 = mhlo.constant dense<0.000000e+00> : tensor + %2 = "mhlo.pad"(%arg0, %1) {edge_padding_high = dense<[0, 0, 1, 1]> : tensor<4xi64>, edge_padding_low = dense<[0, 0, 1, 1]> : tensor<4xi64>, interior_padding = dense<0> : tensor<4xi64>} : (tensor<1x3x6x6xf32>, tensor) -> tensor<1x3x8x8xf32> + %3 = "mhlo.reduce_window"(%2, %1) ({ + ^bb0(%arg1: tensor, %arg2: tensor): + %7 = mhlo.add %arg1, %arg2 : tensor + mhlo.return %7 : tensor + }) {window_dimensions = dense<[1, 1, 3, 3]> : tensor<4xi64>} : (tensor<1x3x8x8xf32>, tensor) -> tensor<1x3x6x6xf32> + %4 = "mhlo.reduce_window"(%0, %1) ({ + ^bb0(%arg1: tensor, %arg2: tensor): + %7 = mhlo.add %arg1, %arg2 : tensor + mhlo.return %7 : tensor + }) {window_dimensions = dense<3> : tensor<2xi64>} : (tensor<8x8xf32>, tensor) -> tensor<6x6xf32> + %5 = "mhlo.broadcast_in_dim"(%4) {broadcast_dimensions = dense<[2, 3]> : tensor<2xi64>} : (tensor<6x6xf32>) -> tensor<1x3x6x6xf32> + %6 = mhlo.divide %3, %5 : tensor<1x3x6x6xf32> + return %6 : tensor<1x3x6x6xf32> +} + +// CHECK-LABEL: func.func @avg_pool2d_2( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<1x3x6x6xf32>) -> tensor<*xf32> { +// CHECK: %[[VAL_1:.*]] = arith.constant dense<[0, 2, 3, 1]> : tensor<4xi32> +// CHECK: %[[VAL_2:.*]] = "tfl.transpose"(%[[VAL_0]], %[[VAL_1]]) : (tensor<1x3x6x6xf32>, tensor<4xi32>) -> tensor<1x6x6x3xf32> +// CHECK: %[[VAL_3:.*]] = "tfl.average_pool_2d"(%[[VAL_2]]) {filter_height = 3 : i32, filter_width = 3 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 1 : i32, stride_w = 1 : i32} : (tensor<1x6x6x3xf32>) -> tensor<1x6x6x3xf32> +// CHECK: %[[VAL_4:.*]] = arith.constant dense<[0, 3, 1, 2]> : tensor<4xi32> +// CHECK: %[[VAL_5:.*]] = "tfl.transpose"(%[[VAL_3]], %[[VAL_4]]) : (tensor<1x6x6x3xf32>, 
tensor<4xi32>) -> tensor<1x3x6x6xf32> +// CHECK: %[[VAL_6:.*]] = "tf.Identity"(%[[VAL_5]]) {device = ""} : (tensor<1x3x6x6xf32>) -> tensor<*xf32> +// CHECK: %[[VAL_7:.*]] = "tf.Identity"(%[[VAL_6]]) {device = ""} : (tensor<*xf32>) -> tensor<*xf32> +// CHECK: return %[[VAL_7]] : tensor<*xf32> +// CHECK: } + +func.func @upsample_bilinear2d(%arg0: tensor<1x64x16x16xf32>) -> (tensor<1x64x32x32xf32>) { + %0 = mhlo.composite "odml.upsample_bilinear2d" %arg0 {composite_attributes = {align_corners = false, output = dense<32> : tensor<2xi64>}, decomposition = @XlaCallModule_odml.upsample_bilinear2d.impl_21_0} : (tensor<1x64x16x16xf32>) -> tensor<1x64x32x32xf32> + return %0 : tensor<1x64x32x32xf32> +} +func.func private @XlaCallModule_odml.upsample_bilinear2d.impl_21_0(%arg0: tensor<1x64x16x16xf32>) -> tensor<1x64x32x32xf32> { + %0 = mhlo.constant dense<[[0.000000e+00], [2.500000e-01], [7.500000e-01], [2.500000e-01], [7.500000e-01], [2.500000e-01], [7.500000e-01], [2.500000e-01], [7.500000e-01], [2.500000e-01], [7.500000e-01], [2.500000e-01], [7.500000e-01], [2.500000e-01], [7.500000e-01], [2.500000e-01], [7.500000e-01], [2.500000e-01], [7.500000e-01], [2.500000e-01], [7.500000e-01], [2.500000e-01], [7.500000e-01], [2.500000e-01], [7.500000e-01], [2.500000e-01], [7.500000e-01], [2.500000e-01], [7.500000e-01], [2.500000e-01], [7.500000e-01], [2.500000e-01]]> : tensor<32x1xf32> + %1 = mhlo.constant dense<[0.000000e+00, 2.500000e-01, 7.500000e-01, 2.500000e-01, 7.500000e-01, 2.500000e-01, 7.500000e-01, 2.500000e-01, 7.500000e-01, 2.500000e-01, 7.500000e-01, 2.500000e-01, 7.500000e-01, 2.500000e-01, 7.500000e-01, 2.500000e-01, 7.500000e-01, 2.500000e-01, 7.500000e-01, 2.500000e-01, 7.500000e-01, 2.500000e-01, 7.500000e-01, 2.500000e-01, 7.500000e-01, 2.500000e-01, 7.500000e-01, 2.500000e-01, 7.500000e-01, 2.500000e-01, 7.500000e-01, 2.500000e-01]> : tensor<32xf32> + %2 = mhlo.constant dense<[1, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 
14, 14, 15, 15, 15]> : tensor<32xi64> + %3 = mhlo.constant dense<16> : tensor + %4 = "mhlo.broadcast_in_dim"(%3) <{broadcast_dimensions = dense<> : tensor<0xi64>}> : (tensor) -> tensor<32x32xi64> + %5 = mhlo.constant dense<0> : tensor + %6 = "mhlo.broadcast_in_dim"(%5) <{broadcast_dimensions = dense<> : tensor<0xi64>}> : (tensor) -> tensor<32x32xi64> + %7 = mhlo.constant dense<[0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15]> : tensor<32xi64> + %8 = "mhlo.broadcast_in_dim"(%7) <{broadcast_dimensions = dense<0> : tensor<1xi64>}> : (tensor<32xi64>) -> tensor<32x32xi64> + %9 = mhlo.compare LT, %8, %6 : (tensor<32x32xi64>, tensor<32x32xi64>) -> tensor<32x32xi1> + %10 = mhlo.add %8, %4 : tensor<32x32xi64> + %11 = mhlo.select %9, %10, %8 : tensor<32x32xi1>, tensor<32x32xi64> + %12 = mhlo.reshape %11 : (tensor<32x32xi64>) -> tensor<32x32x1xi64> + %13 = "mhlo.broadcast_in_dim"(%7) <{broadcast_dimensions = dense<1> : tensor<1xi64>}> : (tensor<32xi64>) -> tensor<32x32xi64> + %14 = mhlo.compare LT, %13, %6 : (tensor<32x32xi64>, tensor<32x32xi64>) -> tensor<32x32xi1> + %15 = mhlo.add %13, %4 : tensor<32x32xi64> + %16 = mhlo.select %14, %15, %13 : tensor<32x32xi1>, tensor<32x32xi64> + %17 = mhlo.reshape %16 : (tensor<32x32xi64>) -> tensor<32x32x1xi64> + %18 = "mhlo.concatenate"(%12, %17) <{dimension = 2 : i64}> : (tensor<32x32x1xi64>, tensor<32x32x1xi64>) -> tensor<32x32x2xi64> + %19 = "mhlo.gather"(%arg0, %18) <{dimension_numbers = #mhlo.gather, slice_sizes = dense<[1, 64, 1, 1]> : tensor<4xi64>}> : (tensor<1x64x16x16xf32>, tensor<32x32x2xi64>) -> tensor<1x64x32x32xf32> + %20 = "mhlo.broadcast_in_dim"(%2) <{broadcast_dimensions = dense<1> : tensor<1xi64>}> : (tensor<32xi64>) -> tensor<32x32xi64> + %21 = mhlo.compare LT, %20, %6 : (tensor<32x32xi64>, tensor<32x32xi64>) -> tensor<32x32xi1> + %22 = mhlo.add %20, %4 : tensor<32x32xi64> + %23 = mhlo.select %21, %22, %20 : tensor<32x32xi1>, tensor<32x32xi64> + %24 = 
mhlo.reshape %23 : (tensor<32x32xi64>) -> tensor<32x32x1xi64> + %25 = "mhlo.concatenate"(%12, %24) <{dimension = 2 : i64}> : (tensor<32x32x1xi64>, tensor<32x32x1xi64>) -> tensor<32x32x2xi64> + %26 = "mhlo.gather"(%arg0, %25) <{dimension_numbers = #mhlo.gather, slice_sizes = dense<[1, 64, 1, 1]> : tensor<4xi64>}> : (tensor<1x64x16x16xf32>, tensor<32x32x2xi64>) -> tensor<1x64x32x32xf32> + %27 = mhlo.subtract %26, %19 : tensor<1x64x32x32xf32> + %28 = "mhlo.broadcast_in_dim"(%1) <{broadcast_dimensions = dense<3> : tensor<1xi64>}> : (tensor<32xf32>) -> tensor<1x64x32x32xf32> + %29 = mhlo.multiply %27, %28 : tensor<1x64x32x32xf32> + %30 = mhlo.add %19, %29 : tensor<1x64x32x32xf32> + %31 = "mhlo.broadcast_in_dim"(%2) <{broadcast_dimensions = dense<0> : tensor<1xi64>}> : (tensor<32xi64>) -> tensor<32x32xi64> + %32 = mhlo.compare LT, %31, %6 : (tensor<32x32xi64>, tensor<32x32xi64>) -> tensor<32x32xi1> + %33 = mhlo.add %31, %4 : tensor<32x32xi64> + %34 = mhlo.select %32, %33, %31 : tensor<32x32xi1>, tensor<32x32xi64> + %35 = mhlo.reshape %34 : (tensor<32x32xi64>) -> tensor<32x32x1xi64> + %36 = "mhlo.concatenate"(%35, %17) <{dimension = 2 : i64}> : (tensor<32x32x1xi64>, tensor<32x32x1xi64>) -> tensor<32x32x2xi64> + %37 = "mhlo.gather"(%arg0, %36) <{dimension_numbers = #mhlo.gather, slice_sizes = dense<[1, 64, 1, 1]> : tensor<4xi64>}> : (tensor<1x64x16x16xf32>, tensor<32x32x2xi64>) -> tensor<1x64x32x32xf32> + %38 = "mhlo.concatenate"(%35, %24) <{dimension = 2 : i64}> : (tensor<32x32x1xi64>, tensor<32x32x1xi64>) -> tensor<32x32x2xi64> + %39 = "mhlo.gather"(%arg0, %38) <{dimension_numbers = #mhlo.gather, slice_sizes = dense<[1, 64, 1, 1]> : tensor<4xi64>}> : (tensor<1x64x16x16xf32>, tensor<32x32x2xi64>) -> tensor<1x64x32x32xf32> + %40 = mhlo.subtract %39, %37 : tensor<1x64x32x32xf32> + %41 = mhlo.multiply %40, %28 : tensor<1x64x32x32xf32> + %42 = mhlo.add %37, %41 : tensor<1x64x32x32xf32> + %43 = mhlo.subtract %42, %30 : tensor<1x64x32x32xf32> + %44 = "mhlo.broadcast_in_dim"(%0) 
<{broadcast_dimensions = dense<[2, 3]> : tensor<2xi64>}> : (tensor<32x1xf32>) -> tensor<1x64x32x1xf32> + %45 = mhlo.reshape %44 : (tensor<1x64x32x1xf32>) -> tensor<1x64x32xf32> + %46 = "mhlo.broadcast_in_dim"(%45) <{broadcast_dimensions = dense<[0, 1, 2]> : tensor<3xi64>}> : (tensor<1x64x32xf32>) -> tensor<1x64x32x32xf32> + %47 = mhlo.multiply %43, %46 : tensor<1x64x32x32xf32> + %48 = mhlo.add %30, %47 : tensor<1x64x32x32xf32> + return %48 : tensor<1x64x32x32xf32> +} + +// CHECK-LABEL: func.func @upsample_bilinear2d( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<1x64x16x16xf32>) -> tensor<1x64x32x32xf32> { +// CHECK: %[[VAL_1:.*]] = arith.constant dense<[0, 2, 3, 1]> : tensor<4xi32> +// CHECK: %[[VAL_2:.*]] = "tfl.transpose"(%[[VAL_0]], %[[VAL_1]]) : (tensor<1x64x16x16xf32>, tensor<4xi32>) -> tensor<1x16x16x64xf32> +// CHECK: %[[VAL_3:.*]] = arith.constant dense<32> : tensor<2xi32> +// CHECK: %[[VAL_4:.*]] = "tfl.resize_bilinear"(%[[VAL_2]], %[[VAL_3]]) {align_corners = false, half_pixel_centers = true} : (tensor<1x16x16x64xf32>, tensor<2xi32>) -> tensor<1x32x32x64xf32> +// CHECK: %[[VAL_5:.*]] = arith.constant dense<[0, 3, 1, 2]> : tensor<4xi32> +// CHECK: %[[VAL_6:.*]] = "tfl.transpose"(%[[VAL_4]], %[[VAL_5]]) : (tensor<1x32x32x64xf32>, tensor<4xi32>) -> tensor<1x64x32x32xf32> +// CHECK: return %[[VAL_6]] : tensor<1x64x32x32xf32> +// CHECK: } diff --git a/tensorflow/compiler/mlir/lite/stablehlo/tests/fold_broadcast.mlir b/tensorflow/compiler/mlir/lite/stablehlo/tests/fold_broadcast.mlir index 1ec3e3b1fa9fbe..de0cba2f56d258 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/tests/fold_broadcast.mlir +++ b/tensorflow/compiler/mlir/lite/stablehlo/tests/fold_broadcast.mlir @@ -5,7 +5,7 @@ func.func @foldBroadcastInDimBeforeMulOp_bcast_dim_1D_float() -> (tensor<1x1x2x4 // CHECK-DAG: %[[RES:.*]] = mhlo.constant dense<{{\[\[\[\[}}1.000000e+00, 4.000000e+00, 9.000000e+00, 1.600000e+01], [5.000000e+00, 1.200000e+01, 2.100000e+01, 3.200000e+01]]]]> : tensor<1x1x2x4xf32> %cst0 = 
mhlo.constant dense<[1.0, 2.0, 3.0, 4.0]> : tensor<4xf32> %cst1 = mhlo.constant dense<[[[[1.0, 2.0, 3.0, 4.0], [5.0, 6.0, 7.0, 8.0]]]]> : tensor<1x1x2x4xf32> - %0 = "mhlo.broadcast_in_dim"(%cst0) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<4xf32>) -> tensor<1x1x2x4xf32> + %0 = "mhlo.broadcast_in_dim"(%cst0) <{broadcast_dimensions = dense<3> : tensor<1xi64>}> : (tensor<4xf32>) -> tensor<1x1x2x4xf32> %1 = mhlo.multiply %0, %cst1 : tensor<1x1x2x4xf32> // CHECK: return %[[RES]] : tensor<1x1x2x4xf32> func.return %1 : tensor<1x1x2x4xf32> @@ -16,7 +16,7 @@ func.func @foldBroadcastInDimBeforeMulOp_bcast_dim_2D_float() -> (tensor<1x2x2x3 // CHECK-DAG: %[[RES:.*]] = mhlo.constant dense<{{\[\[\[\[}}1.000000e+00, 4.000000e+00, 9.000000e+00], [4.000000e+00, 1.000000e+01, 1.800000e+01]], {{\[\[}}2.800000e+01, 4.000000e+01, 5.400000e+01], [4.000000e+01, 5.500000e+01, 7.200000e+01]]]]> : tensor<1x2x2x3xf32> %cst0 = mhlo.constant dense<[[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]> : tensor<2x3xf32> %cst1 = mhlo.constant dense<[[[[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]], [[7.0, 8.0, 9.0], [10.0, 11.0, 12.0]]]]> : tensor<1x2x2x3xf32> - %0 = "mhlo.broadcast_in_dim"(%cst0) {broadcast_dimensions = dense<[1, 3]> : tensor<2xi64>} : (tensor<2x3xf32>) -> tensor<1x2x2x3xf32> + %0 = "mhlo.broadcast_in_dim"(%cst0) <{broadcast_dimensions = dense<[1, 3]> : tensor<2xi64>}> : (tensor<2x3xf32>) -> tensor<1x2x2x3xf32> %1 = mhlo.multiply %0, %cst1 : tensor<1x2x2x3xf32> // CHECK: return %[[RES]] : tensor<1x2x2x3xf32> func.return %1 : tensor<1x2x2x3xf32> @@ -27,7 +27,7 @@ func.func @foldBroadcastInDimBeforeMulOp_bcast_dim_1D_int() -> (tensor<1x1x2x4xi // CHECK-DAG: %[[RES:.*]] = mhlo.constant dense<{{\[\[\[\[}}1, 4, 9, 16], [5, 12, 21, 32]]]]> : tensor<1x1x2x4xi32> %cst0 = mhlo.constant dense<[1, 2, 3, 4]> : tensor<4xi32> %cst1 = mhlo.constant dense<[[[[1, 2, 3, 4], [5, 6, 7, 8]]]]> : tensor<1x1x2x4xi32> - %0 = "mhlo.broadcast_in_dim"(%cst0) {broadcast_dimensions = dense<3> : tensor<1xi64>} : 
(tensor<4xi32>) -> tensor<1x1x2x4xi32> + %0 = "mhlo.broadcast_in_dim"(%cst0) <{broadcast_dimensions = dense<3> : tensor<1xi64>}> : (tensor<4xi32>) -> tensor<1x1x2x4xi32> %1 = mhlo.multiply %0, %cst1 : tensor<1x1x2x4xi32> // CHECK: return %[[RES]] : tensor<1x1x2x4xi32> func.return %1 : tensor<1x1x2x4xi32> @@ -38,7 +38,7 @@ func.func @foldBroadcastInDimBeforeMulOp_bcast_dim_4D_int() -> tensor<1x2x1x4xi3 // CHECK-DAG: %[[RES:.*]] = mhlo.constant dense<{{\[\[\[\[}}0, 1, 4, 9]], {{\[\[}}0, 1, 4, 9]]]]> : tensor<1x2x1x4xi32> %0 = mhlo.constant dense<[[[[0, 1, 2, 3]]]]> : tensor<1x1x1x4xi32> %1 = mhlo.constant dense<[[[[0, 1, 2, 3]], [[0, 1, 2, 3]]]]> : tensor<1x2x1x4xi32> - %2 = "mhlo.broadcast_in_dim"(%0) {broadcast_dimensions = dense<[0, 1, 2, 3]> : tensor<4xi64>} : (tensor<1x1x1x4xi32>) -> tensor<1x2x1x4xi32> + %2 = "mhlo.broadcast_in_dim"(%0) <{broadcast_dimensions = dense<[0, 1, 2, 3]> : tensor<4xi64>}> : (tensor<1x1x1x4xi32>) -> tensor<1x2x1x4xi32> %3 = mhlo.multiply %1, %2 : tensor<1x2x1x4xi32> // CHECK: return %[[RES]] : tensor<1x2x1x4xi32> return %3 : tensor<1x2x1x4xi32> @@ -48,8 +48,8 @@ func.func @foldBroadcastInDimBeforeMulOp_bcast_dim_4D_int() -> tensor<1x2x1x4xi3 func.func @notFoldBroadcastInDimBeforeMulOpWhenArgIsNonConst_bcast_dim_1D_int(%arg0: tensor<1x1x2x4xi32>) -> (tensor<1x1x2x4xi32>) { // CHECK-DAG: %[[CONST:.*]] = mhlo.constant dense<{{\[}}1, 2, 3, 4]> : tensor<4xi32> %cst0 = mhlo.constant dense<[1, 2, 3, 4]> : tensor<4xi32> - // CHECK: %[[BROADCAST:.*]] = "mhlo.broadcast_in_dim"(%[[CONST]]) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<4xi32>) -> tensor<1x1x2x4xi32> - %0 = "mhlo.broadcast_in_dim"(%cst0) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<4xi32>) -> tensor<1x1x2x4xi32> + // CHECK: %[[BROADCAST:.*]] = "mhlo.broadcast_in_dim"(%[[CONST]]) <{broadcast_dimensions = dense<3> : tensor<1xi64>}> : (tensor<4xi32>) -> tensor<1x1x2x4xi32> + %0 = "mhlo.broadcast_in_dim"(%cst0) <{broadcast_dimensions = dense<3> : 
tensor<1xi64>}> : (tensor<4xi32>) -> tensor<1x1x2x4xi32> // CHECK: %[[MUL:.*]] = mhlo.multiply %[[BROADCAST]], %[[ARG]] : tensor<1x1x2x4xi32> %1 = mhlo.multiply %0, %arg0 : tensor<1x1x2x4xi32> // CHECK: return %[[MUL]] : tensor<1x1x2x4xi32> diff --git a/tensorflow/compiler/mlir/lite/stablehlo/tests/fuse_mhlo_convolution.mlir b/tensorflow/compiler/mlir/lite/stablehlo/tests/fuse_mhlo_convolution.mlir index 042defb58dda00..98e97d1ef7b4c0 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/tests/fuse_mhlo_convolution.mlir +++ b/tensorflow/compiler/mlir/lite/stablehlo/tests/fuse_mhlo_convolution.mlir @@ -5,13 +5,13 @@ func.func @fuseMulAndConv2D(%input: tensor<1x256x256x3xf32>) -> (tensor<1x256x256x2xf32>) { // CHECK-DAG: %[[FILTER:.+]] = mhlo.constant dense<{{\[\[\[\[}}1.000000e+00, 2.000000e+00], [3.000000e+00, 4.000000e+00], [5.000000e+00, 6.000000e+00]]]]> : tensor<1x1x3x2xf32> // CHECK-DAG: %[[CST:.+]] = mhlo.constant dense<[1.000000e-01, 2.000000e-01]> : tensor<2xf32> - // CHECK-DAG: %[[CST_BCAST:.+]] = "mhlo.broadcast_in_dim"(%[[CST]]) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<2xf32>) -> tensor<1x1x3x2xf32> + // CHECK-DAG: %[[CST_BCAST:.+]] = "mhlo.broadcast_in_dim"(%[[CST]]) <{broadcast_dimensions = dense<3> : tensor<1xi64>}> : (tensor<2xf32>) -> tensor<1x1x3x2xf32> // CHECK-DAG: %[[NEW_FILTER:.+]] = mhlo.multiply %[[CST_BCAST]], %[[FILTER]] : tensor<1x1x3x2xf32> // CHECK-DAG: %[[RESULT:.+]] = mhlo.convolution(%[[INPUT]], %[[NEW_FILTER]]) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {stride = [1, 1], pad = {{\[\[}}0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x256x256x3xf32>, tensor<1x1x3x2xf32>) -> tensor<1x256x256x2xf32> %filter = mhlo.constant dense<[[[[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]]]]> : tensor<1x1x3x2xf32> %cst = mhlo.constant dense<[0.1, 0.2]> : tensor<2xf32> %0 = mhlo.convolution(%input, %filter) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], 
window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x256x256x3xf32>, tensor<1x1x3x2xf32>) -> tensor<1x256x256x2xf32> - %1 = "mhlo.broadcast_in_dim"(%cst) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<2xf32>) -> tensor<1x256x256x2xf32> + %1 = "mhlo.broadcast_in_dim"(%cst) <{broadcast_dimensions = dense<3> : tensor<1xi64>}> : (tensor<2xf32>) -> tensor<1x256x256x2xf32> %2 = mhlo.multiply %0, %1 : tensor<1x256x256x2xf32> // CHECK-DAG: return %[[RESULT]] func.return %2 : tensor<1x256x256x2xf32> @@ -25,20 +25,20 @@ func.func @fuseMulAndConv2DDynamic(%input: tensor) -> (tensor : tensor<1x1x3x2xf32> // CHECK-DAG: %[[CST_0:.+]] = mhlo.constant dense<[1.000000e-01, 2.000000e-01]> : tensor<2xf32> // CHECK-DAG: %[[CST_1:.+]] = mhlo.constant dense<[3.000000e-01, 4.000000e-01]> : tensor<2xf32> - // CHECK: %[[CST_BCAST:.+]] = "mhlo.broadcast_in_dim"(%[[CST_0]]) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<2xf32>) -> tensor<1x1x3x2xf32> + // CHECK: %[[CST_BCAST:.+]] = "mhlo.broadcast_in_dim"(%[[CST_0]]) <{broadcast_dimensions = dense<3> : tensor<1xi64>}> : (tensor<2xf32>) -> tensor<1x1x3x2xf32> // CHECK: %[[NEW_FILTER:.+]] = mhlo.multiply %[[CST_BCAST]], %[[FILTER]] : tensor<1x1x3x2xf32> // CHECK: %[[CONV:.+]] = mhlo.convolution(%[[INPUT]], %[[NEW_FILTER]]) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {stride = [1, 1], pad = {{\[\[}}0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor, tensor<1x1x3x2xf32>) -> tensor // CHECK: %[[SHAPE:.+]] = shape.shape_of %[[CONV]] : tensor -> tensor<4xindex> - // CHECK: %[[DYNAMIC_BCAST:.+]] = "mhlo.dynamic_broadcast_in_dim"(%[[CST_1]], %[[SHAPE]]) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<2xf32>, tensor<4xindex>) -> tensor + // CHECK: %[[DYNAMIC_BCAST:.+]] = "mhlo.dynamic_broadcast_in_dim"(%[[CST_1]], %[[SHAPE]]) <{broadcast_dimensions 
= dense<3> : tensor<1xi64>}> : (tensor<2xf32>, tensor<4xindex>) -> tensor // CHECK: %[[ADD:.+]] = mhlo.add %[[CONV]], %[[DYNAMIC_BCAST]] : tensor %filter = mhlo.constant dense<[[[[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]]]]> : tensor<1x1x3x2xf32> %cst_0 = mhlo.constant dense<[0.1, 0.2]> : tensor<2xf32> %cst_1 = mhlo.constant dense<[0.3, 0.4]> : tensor<2xf32> %0 = mhlo.convolution(%input, %filter) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor, tensor<1x1x3x2xf32>) -> tensor %1 = shape.shape_of %0 : tensor -> tensor<4xindex> - %2 = "mhlo.dynamic_broadcast_in_dim"(%cst_0, %1) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<2xf32>, tensor<4xindex>) -> tensor + %2 = "mhlo.dynamic_broadcast_in_dim"(%cst_0, %1) <{broadcast_dimensions = dense<3> : tensor<1xi64>}> : (tensor<2xf32>, tensor<4xindex>) -> tensor %3 = mhlo.multiply %0, %2 : tensor - %4 = "mhlo.dynamic_broadcast_in_dim"(%cst_1, %1) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<2xf32>, tensor<4xindex>) -> tensor + %4 = "mhlo.dynamic_broadcast_in_dim"(%cst_1, %1) <{broadcast_dimensions = dense<3> : tensor<1xi64>}> : (tensor<2xf32>, tensor<4xindex>) -> tensor %5 = mhlo.add %3, %4 : tensor // CHECK-DAG: return %[[ADD]] func.return %5 : tensor diff --git a/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize-skip-partitioned-calls.mlir b/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize-skip-partitioned-calls.mlir new file mode 100644 index 00000000000000..79ccccff831531 --- /dev/null +++ b/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize-skip-partitioned-calls.mlir @@ -0,0 +1,34 @@ +// RUN: odml-to-stablehlo-opt %s --tf-stablehlo=skip-partitioned-calls=true | FileCheck %s --check-prefix=CHECK-SKIP +// RUN: odml-to-stablehlo-opt %s --tf-stablehlo=skip-partitioned-calls=false | FileCheck %s --check-prefix=CHECK-NOSKIP + +module { + 
func.func @partitioned_call(%arg0: tensor<1x2x2x3xf32>) -> (tensor<1x2x2x3xf32>) { + %0 = "tf.StatefulPartitionedCall"(%arg0) <{ + config = "", config_proto = "", executor_type = "", f = @some_func + }> { + _collective_manager_ids = [], device = "" + } : (tensor<1x2x2x3xf32>) -> tensor<1x2x2x3xf32> + // CHECK-SKIP: tf.StatefulPartitionedCall + // CHECK-NOSKIP: call @some_func + // CHECK-NOSKIP-NOT: tf.StatefulPartitionedCall + %1 = "tf.PartitionedCall"(%0) <{ + config = "", config_proto = "", executor_type = "", f = @some_other_func + }> { + _collective_manager_ids = [], device = "" + } : (tensor<1x2x2x3xf32>) -> tensor<1x2x2x3xf32> + // CHECK-SKIP: tf.PartitionedCall + // CHECK-NOSKIP: call @some_other_func + // CHECK-NOSKIP-NOT: tf.PartitionedCall + func.return %1: tensor<1x2x2x3xf32> + } + + // CHECK-SKIP: func.func private @some_func + func.func private @some_func(%arg0: tensor<1x2x2x3xf32>) -> tensor<1x2x2x3xf32> attributes {tf._noinline = true} { + return %arg0 : tensor<1x2x2x3xf32> + } + + // CHECK-SKIP: func.func private @some_other_func + func.func private @some_other_func(%arg0: tensor<1x2x2x3xf32>) -> tensor<1x2x2x3xf32> attributes {tf._noinline = true} { + return %arg0 : tensor<1x2x2x3xf32> + } +} diff --git a/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize-skip-stateful-partition-calls.mlir b/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize-skip-stateful-partition-calls.mlir deleted file mode 100644 index 032bf414edfe0f..00000000000000 --- a/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize-skip-stateful-partition-calls.mlir +++ /dev/null @@ -1,19 +0,0 @@ -// RUN: odml-to-stablehlo-opt %s --tf-stablehlo=skip-stateful-partitioned-call=true | FileCheck %s --check-prefix=CHECK-SKIP -// RUN: odml-to-stablehlo-opt %s --tf-stablehlo=skip-stateful-partitioned-call=false | FileCheck %s --check-prefix=CHECK-NOSKIP - -module { - func.func @partitioned_call(%arg0: tensor<1x2x2x3xf32>) -> (tensor<1x2x2x3xf32>) { - %0 = 
"tf.StatefulPartitionedCall"(%arg0) <{ - config = "", config_proto = "", executor_type = "", f = @some_func - }> { - _collective_manager_ids = [], device = "" - } : (tensor<1x2x2x3xf32>) -> tensor<1x2x2x3xf32> - // CHECK-SKIP: tf.StatefulPartitionedCall - // CHECK-NOSKIP-NOT: tf.StatefulPartitionedCall - func.return %0: tensor<1x2x2x3xf32> - } - - func.func private @some_func(%arg0: tensor<1x2x2x3xf32>) -> tensor<1x2x2x3xf32> attributes {tf._noinline = true} { - return %arg0 : tensor<1x2x2x3xf32> - } -} diff --git a/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize-stablehlo-tfl-composite.mlir b/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize-stablehlo-tfl-composite.mlir index 41a94b929c0f47..268247e815faa3 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize-stablehlo-tfl-composite.mlir +++ b/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize-stablehlo-tfl-composite.mlir @@ -4,23 +4,10 @@ module { func.func public @main(%arg0: tensor<3x3xf32>, %arg1: tensor<3x3xf32>, %arg2: tensor<1x100x32x4xf32>, %arg3: tensor<1x500x4x4xf32>, %arg4: tensor<1x500x4x4xf32>, %arg5: tensor<1x1x100x500xf32>, %arg6: tensor) - -> (tensor<3x3xf32>, tensor<1x100x32x4xf32>) { - // CHECK-ROUNDTRIP: %0 = "tfl.custom"(%arg0, %arg1) {custom_code = "odml.update_kv_cache", custom_option = #tfl} : (tensor<3x3xf32>, tensor<3x3xf32>) -> tensor<3x3xf32> - // CHECK-ROUNDTRIP: %1 = "tfl.custom"(%arg2, %arg3, %arg4, %arg5, %arg6) {custom_code = "odml.scaled_dot_product_attention", custom_option = #tfl} : (tensor<1x100x32x4xf32>, tensor<1x500x4x4xf32>, tensor<1x500x4x4xf32>, tensor<1x1x100x500xf32>, tensor) -> tensor<1x100x32x4xf32> - %0 = func.call @test_kv_cache(%arg0, %arg1) : (tensor<3x3xf32>, tensor<3x3xf32>) -> tensor<3x3xf32> - %1 = func.call @test_sdpa(%arg2, %arg3, %arg4, %arg5, %arg6) : (tensor<1x100x32x4xf32>, tensor<1x500x4x4xf32>, tensor<1x500x4x4xf32>, tensor<1x1x100x500xf32>, tensor) -> tensor<1x100x32x4xf32> - return %0, %1 : tensor<3x3xf32>, 
tensor<1x100x32x4xf32> - } - - // CHECK-LABEL: func.func private @test_kv_cache - func.func private @test_kv_cache(%arg0: tensor<3x3xf32>, %arg1: tensor<3x3xf32>) -> tensor<3x3xf32> { - // CHECK: %0 = "tfl.custom"(%arg0, %arg1) {custom_code = "odml.update_kv_cache", custom_option = #tfl} : (tensor<3x3xf32>, tensor<3x3xf32>) -> tensor<3x3xf32> - %0 = stablehlo.composite "odml.update_kv_cache" %arg0, %arg1 {composite_attributes = {kv_cache_max = 500 : i64}, decomposition = @odml.update_kv_cache.impl} : (tensor<3x3xf32>, tensor<3x3xf32>) -> tensor<3x3xf32> - return %0 : tensor<3x3xf32> - } - func.func private @odml.update_kv_cache.impl(%arg0: tensor<3x3xf32>, %arg1: tensor<3x3xf32>) -> tensor<3x3xf32> { - // No decomposition provided for test case. - return %arg0 : tensor<3x3xf32> + -> tensor<1x100x32x4xf32> { + // CHECK-ROUNDTRIP: %0 = "tfl.custom"(%arg2, %arg3, %arg4, %arg5, %arg6) {custom_code = "odml.scaled_dot_product_attention", custom_option = #tfl} : (tensor<1x100x32x4xf32>, tensor<1x500x4x4xf32>, tensor<1x500x4x4xf32>, tensor<1x1x100x500xf32>, tensor) -> tensor<1x100x32x4xf32> + %0 = func.call @test_sdpa(%arg2, %arg3, %arg4, %arg5, %arg6) : (tensor<1x100x32x4xf32>, tensor<1x500x4x4xf32>, tensor<1x500x4x4xf32>, tensor<1x1x100x500xf32>, tensor) -> tensor<1x100x32x4xf32> + return %0: tensor<1x100x32x4xf32> } // CHECK-LABEL: func.func private @test_sdpa @@ -34,4 +21,30 @@ module { return %arg0 : tensor<1x100x32x4xf32> } + // CHECK-LABEL: func.func private @test_multiple_kv_caches + func.func private @test_multiple_kv_caches(%arg0: tensor<1x500x4x4xf32>, %arg1: tensor<1x500x4x4xf32>, %arg2: tensor<100xi64>, %arg3: tensor<1x100x4x4xf32>, %arg4: tensor<1x100x4x4xf32>) -> (tensor<1x500x4x4xf32>, tensor<1x500x4x4xf32>) { + // CHECK: %0:2 = "tfl.custom"(%arg2, %arg3, %arg4) {custom_code = "odml.update_kv_cache", custom_option = #tfl} : (tensor<100xi64>, tensor<1x100x4x4xf32>, tensor<1x100x4x4xf32>) -> (tensor<1x500x4x4xf32>, tensor<1x500x4x4xf32>) + // CHECK: %1:2 = 
"tfl.custom"(%arg2, %arg3, %arg4) {custom_code = "odml.update_kv_cache", custom_option = #tfl} : (tensor<100xi64>, tensor<1x100x4x4xf32>, tensor<1x100x4x4xf32>) -> (tensor<1x500x4x4xf32>, tensor<1x500x4x4xf32>) + %0:2 = stablehlo.composite "odml.update_kv_cache" %arg0, %arg1, %arg2, %arg3, %arg4 {composite_attributes = {kv_cache_max = 500 : i64}, decomposition = @odml.update_kv_cache.impl_0} : (tensor<1x500x4x4xf32>, tensor<1x500x4x4xf32>, tensor<100xi64>, tensor<1x100x4x4xf32>, tensor<1x100x4x4xf32>) -> (tensor<1x500x4x4xf32>, tensor<1x500x4x4xf32>) + %1:2 = stablehlo.composite "odml.update_kv_cache" %0#0, %0#1, %arg2, %arg3, %arg4 {composite_attributes = {kv_cache_max = 500 : i64}, decomposition = @odml.update_kv_cache.impl_0} : (tensor<1x500x4x4xf32>, tensor<1x500x4x4xf32>, tensor<100xi64>, tensor<1x100x4x4xf32>, tensor<1x100x4x4xf32>) -> (tensor<1x500x4x4xf32>, tensor<1x500x4x4xf32>) + return %1#0, %1#1 : tensor<1x500x4x4xf32>, tensor<1x500x4x4xf32> + } + func.func private @odml.update_kv_cache.impl_0(%arg0: tensor<1x500x4x4xf32>, %arg1: tensor<1x500x4x4xf32>, %arg2: tensor<100xi64>, %arg3: tensor<1x100x4x4xf32>, %arg4: tensor<1x100x4x4xf32>) -> (tensor<1x500x4x4xf32>, tensor<1x500x4x4xf32>) { + %0 = stablehlo.constant dense<500> : tensor<100xi64> + %1 = stablehlo.constant dense<0> : tensor<100xi64> + %2 = stablehlo.compare LT, %arg2, %1 : (tensor<100xi64>, tensor<100xi64>) -> tensor<100xi1> + %3 = stablehlo.add %arg2, %0 : tensor<100xi64> + %4 = stablehlo.select %2, %3, %arg2 : tensor<100xi1>, tensor<100xi64> + %5 = stablehlo.reshape %4 : (tensor<100xi64>) -> tensor<100x1xi64> + %6 = "stablehlo.scatter"(%arg0, %5, %arg3) ({ + ^bb0(%arg5: tensor, %arg6: tensor): + stablehlo.return %arg6 : tensor + }) {indices_are_sorted = false, scatter_dimension_numbers = #stablehlo.scatter, unique_indices = false} : (tensor<1x500x4x4xf32>, tensor<100x1xi64>, tensor<1x100x4x4xf32>) -> tensor<1x500x4x4xf32> + %7 = "stablehlo.scatter"(%arg1, %5, %arg4) ({ + ^bb0(%arg5: tensor, 
%arg6: tensor): + stablehlo.return %arg6 : tensor + }) {indices_are_sorted = false, scatter_dimension_numbers = #stablehlo.scatter, unique_indices = false} : (tensor<1x500x4x4xf32>, tensor<100x1xi64>, tensor<1x100x4x4xf32>) -> tensor<1x500x4x4xf32> + return %6, %7 : tensor<1x500x4x4xf32>, tensor<1x500x4x4xf32> + } + } \ No newline at end of file diff --git a/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize-tfl-stablehlo-constant.mlir b/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize-tfl-stablehlo-constant.mlir index d4d3b0abf01de3..f5d5a734fad666 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize-tfl-stablehlo-constant.mlir +++ b/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize-tfl-stablehlo-constant.mlir @@ -9,7 +9,7 @@ module { // CHECK: module { // CHECK-NEXT: func @main() -> tensor<1xi64> { -// CHECK-NEXT: %0 = stablehlo.constant dense<2> : tensor<1xi64> -// CHECK-NEXT: return %0 : tensor<1xi64> +// CHECK-NEXT: %[[c0:.+]] = stablehlo.constant dense<2> : tensor<1xi64> +// CHECK-NEXT: return %[[c0]] : tensor<1xi64> // CHECK-NEXT: } // CHECK-NEXT: } diff --git a/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize_hlo.mlir b/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize_hlo.mlir index 8b454fa898e5a6..2329f68b36fc33 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize_hlo.mlir +++ b/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize_hlo.mlir @@ -55,7 +55,7 @@ func.func @add(%arg0: tensor<2xi32>) -> tensor<2xi32> { // CHECK: return %[[VAL_2]], %[[VAL_3]] : tensor<1x1000xf32>, tensor<1x1000xf32> // CHECK: } func.func @broadcast_add(%arg0: tensor<1x1xf32>, %arg1: tensor<1x1000xf32>) -> (tensor<1x1000xf32>, tensor<1x1000xf32>) { - %0 = "mhlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<1x1xf32>) -> tensor<1x1000xf32> + %0 = "mhlo.broadcast_in_dim"(%arg0) <{broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>}> : (tensor<1x1xf32>) -> tensor<1x1000xf32> %1 = 
mhlo.add %0, %arg1 : tensor<1x1000xf32> %2 = mhlo.add %arg1, %0 : tensor<1x1000xf32> func.return %1, %2 : tensor<1x1000xf32>, tensor<1x1000xf32> @@ -109,7 +109,7 @@ func.func @div(%arg0: tensor<2xi32>) -> tensor<2xi32> { // CHECK: return %[[VAL_2]], %[[VAL_3]] : tensor<1x1000xf32>, tensor<1x1000xf32> // CHECK: } func.func @broadcast_div(%arg0: tensor<1x1xf32>, %arg1: tensor<1x1000xf32>) -> (tensor<1x1000xf32>, tensor<1x1000xf32>) { - %0 = "mhlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<1x1xf32>) -> tensor<1x1000xf32> + %0 = "mhlo.broadcast_in_dim"(%arg0) <{broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>}> : (tensor<1x1xf32>) -> tensor<1x1000xf32> %1 = mhlo.divide %0, %arg1 : tensor<1x1000xf32> %2 = mhlo.divide %arg1, %0 : tensor<1x1000xf32> func.return %1, %2 : tensor<1x1000xf32>, tensor<1x1000xf32> @@ -146,7 +146,7 @@ func.func @shift_left(%arg0: tensor<4xi32>, %arg1: tensor<4xi32>) -> tensor<4xi3 // CHECK: return %[[VAL_2]], %[[VAL_3]] : tensor<4xi32>, tensor<4xi32> // CHECK: } func.func @broadcast_shift_left(%arg0: tensor<1xi32>, %arg1: tensor<4xi32>) -> (tensor<4xi32>, tensor<4xi32>) { - %0 = "mhlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<[0]> : tensor<1xi64>} : (tensor<1xi32>) -> tensor<4xi32> + %0 = "mhlo.broadcast_in_dim"(%arg0) <{broadcast_dimensions = dense<[0]> : tensor<1xi64>}> : (tensor<1xi32>) -> tensor<4xi32> %1 = mhlo.shift_left %0, %arg1 : tensor<4xi32> %2 = mhlo.shift_left %arg1, %0 : tensor<4xi32> func.return %1, %2 : tensor<4xi32>, tensor<4xi32> @@ -183,7 +183,7 @@ func.func @maximum(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> // CHECK: return %[[VAL_2]], %[[VAL_3]] : tensor<1x1000xf32>, tensor<1x1000xf32> // CHECK: } func.func @broadcast_maximum(%arg0: tensor<1x1xf32>, %arg1: tensor<1x1000xf32>) -> (tensor<1x1000xf32>, tensor<1x1000xf32>) { - %0 = "mhlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<1x1xf32>) -> 
tensor<1x1000xf32> + %0 = "mhlo.broadcast_in_dim"(%arg0) <{broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>}> : (tensor<1x1xf32>) -> tensor<1x1000xf32> %1 = mhlo.maximum %0, %arg1 : tensor<1x1000xf32> %2 = mhlo.maximum %arg1, %0 : tensor<1x1000xf32> func.return %1, %2 : tensor<1x1000xf32>, tensor<1x1000xf32> @@ -209,7 +209,7 @@ func.func @minimum(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> // CHECK: return %[[VAL_2]], %[[VAL_3]] : tensor<1x1000xf32>, tensor<1x1000xf32> // CHECK: } func.func @broadcast_minimum(%arg0: tensor<1x1xf32>, %arg1: tensor<1x1000xf32>) -> (tensor<1x1000xf32>, tensor<1x1000xf32>) { - %0 = "mhlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<1x1xf32>) -> tensor<1x1000xf32> + %0 = "mhlo.broadcast_in_dim"(%arg0) <{broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>}> : (tensor<1x1xf32>) -> tensor<1x1000xf32> %1 = mhlo.minimum %0, %arg1 : tensor<1x1000xf32> %2 = mhlo.minimum %arg1, %0 : tensor<1x1000xf32> func.return %1, %2 : tensor<1x1000xf32>, tensor<1x1000xf32> @@ -234,7 +234,7 @@ func.func @mul(%arg0: tensor<2xi32>) -> tensor<2xi32> { // CHECK: return %[[VAL_2]], %[[VAL_3]] : tensor<1x1000xf32>, tensor<1x1000xf32> // CHECK: } func.func @broadcast_mul(%arg0: tensor<1x1xf32>, %arg1: tensor<1x1000xf32>) -> (tensor<1x1000xf32>, tensor<1x1000xf32>) { - %0 = "mhlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<1x1xf32>) -> tensor<1x1000xf32> + %0 = "mhlo.broadcast_in_dim"(%arg0) <{broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>}> : (tensor<1x1xf32>) -> tensor<1x1000xf32> %1 = mhlo.multiply %0, %arg1 : tensor<1x1000xf32> %2 = mhlo.multiply %arg1, %0 : tensor<1x1000xf32> func.return %1, %2 : tensor<1x1000xf32>, tensor<1x1000xf32> @@ -291,7 +291,7 @@ func.func @sub(%arg0: tensor<2xi32>) -> tensor<2xi32> { // CHECK: return %[[VAL_2]], %[[VAL_3]] : tensor<1x1000xf32>, tensor<1x1000xf32> // CHECK: } func.func @broadcast_sub(%arg0: 
tensor<1x1xf32>, %arg1: tensor<1x1000xf32>) -> (tensor<1x1000xf32>, tensor<1x1000xf32>) { - %0 = "mhlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<1x1xf32>) -> tensor<1x1000xf32> + %0 = "mhlo.broadcast_in_dim"(%arg0) <{broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>}> : (tensor<1x1xf32>) -> tensor<1x1000xf32> %1 = mhlo.subtract %0, %arg1 : tensor<1x1000xf32> %2 = mhlo.subtract %arg1, %0 : tensor<1x1000xf32> func.return %1, %2 : tensor<1x1000xf32>, tensor<1x1000xf32> @@ -317,7 +317,7 @@ func.func @broadcast_sub_chlo(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> t // CHECK: return %[[VAL_2]], %[[VAL_3]] : tensor<4xf32>, tensor<4xf32> // CHECK: } func.func @broadcast_atan2(%arg0: tensor<1xf32>, %arg1: tensor<4xf32>) -> (tensor<4xf32>, tensor<4xf32>) { - %0 = "mhlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<[0]> : tensor<1xi64>} : (tensor<1xf32>) -> tensor<4xf32> + %0 = "mhlo.broadcast_in_dim"(%arg0) <{broadcast_dimensions = dense<[0]> : tensor<1xi64>}> : (tensor<1xf32>) -> tensor<4xf32> %1 = mhlo.atan2 %0, %arg1 : tensor<4xf32> %2 = mhlo.atan2 %arg1, %0 : tensor<4xf32> func.return %1, %2 : tensor<4xf32>, tensor<4xf32> @@ -429,7 +429,7 @@ func.func @bitwise_or(%arg0: tensor<4xi32>, %arg1: tensor<4xi32>) -> tensor<4xi3 // CHECK: return %[[VAL_2]] : tensor<1x4xi8> // CHECK: } func.func @bitwise_or_broadcast(%arg0: tensor<1xi8>, %arg1: tensor<1x4xi8>) -> tensor<1x4xi8> { - %0 = "mhlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<[1]> : tensor<1xi64>} : (tensor<1xi8>) -> tensor<1x4xi8> + %0 = "mhlo.broadcast_in_dim"(%arg0) <{broadcast_dimensions = dense<[1]> : tensor<1xi64>}> : (tensor<1xi8>) -> tensor<1x4xi8> %1 = mhlo.or %0, %arg1 : tensor<1x4xi8> func.return %1 : tensor<1x4xi8> } @@ -474,7 +474,7 @@ func.func @bitwise_xor(%arg0: tensor<4xi32>, %arg1: tensor<4xi32>) -> tensor<4xi // CHECK: return %[[VAL_2]] : tensor<1x4xi8> // CHECK: } func.func @bitwise_xor_broadcast(%arg0: tensor<1xi8>, %arg1: 
tensor<1x4xi8>) -> tensor<1x4xi8> { - %0 = "mhlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<1xi8>) -> tensor<1x4xi8> + %0 = "mhlo.broadcast_in_dim"(%arg0) <{broadcast_dimensions = dense<1> : tensor<1xi64>}> : (tensor<1xi8>) -> tensor<1x4xi8> %1 = mhlo.xor %0, %arg1 : tensor<1x4xi8> func.return %1 : tensor<1x4xi8> } @@ -497,7 +497,7 @@ func.func @bitwise_and(%arg0: tensor<4xi32>, %arg1: tensor<4xi32>) -> tensor<4xi // CHECK: return %[[VAL_2]] : tensor<1x4xi8> // CHECK: } func.func @bitwise_and_broadcast(%arg0: tensor<1xi8>, %arg1: tensor<1x4xi8>) -> tensor<1x4xi8> { - %0 = "mhlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<[1]> : tensor<1xi64>} : (tensor<1xi8>) -> tensor<1x4xi8> + %0 = "mhlo.broadcast_in_dim"(%arg0) <{broadcast_dimensions = dense<[1]> : tensor<1xi64>}> : (tensor<1xi8>) -> tensor<1x4xi8> %1 = mhlo.and %0, %arg1 : tensor<1x4xi8> func.return %1 : tensor<1x4xi8> } @@ -543,7 +543,7 @@ func.func @pow(%arg0: tensor<2xf32>) -> tensor<2xf32> { // CHECK: return %[[VAL_2]], %[[VAL_3]] : tensor<4xi32>, tensor<4xi32> // CHECK: } func.func @broadcast_pow(%arg0: tensor<1xi32>, %arg1: tensor<4xi32>) -> (tensor<4xi32>, tensor<4xi32>) { - %0 = "mhlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<[0]> : tensor<1xi64>} : (tensor<1xi32>) -> tensor<4xi32> + %0 = "mhlo.broadcast_in_dim"(%arg0) <{broadcast_dimensions = dense<[0]> : tensor<1xi64>}> : (tensor<1xi32>) -> tensor<4xi32> %1 = mhlo.power %0, %arg1 : tensor<4xi32> %2 = mhlo.power %arg1, %0 : tensor<4xi32> func.return %1, %2 : tensor<4xi32>, tensor<4xi32> @@ -695,7 +695,7 @@ func.func @equal_dynamic(%arg0: tensor, %arg1: tensor<1xi32>) -> tensor // CHECK: } func.func @equal_broadcast(%arg0: tensor<1x1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi1> { - %0 = "mhlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<1x1xi32>) -> tensor<1x2xi32> + %0 = "mhlo.broadcast_in_dim"(%arg0) <{broadcast_dimensions = dense<[0, 1]> 
: tensor<2xi64>}> : (tensor<1x1xi32>) -> tensor<1x2xi32> %1 = "mhlo.compare"(%0, %arg1) {comparison_direction = #mhlo} : (tensor<1x2xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> func.return %1 : tensor<1x2xi1> } @@ -758,7 +758,7 @@ func.func @notequal(%arg0: tensor<2xi32>, %arg1: tensor<2xi32>) -> tensor<2xi1> // CHECK: return %[[VAL_2]] : tensor<1x2xi1> // CHECK: } func.func @notequal_broadcast(%arg0: tensor<1x1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi1> { - %0 = "mhlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<1x1xi32>) -> tensor<1x2xi32> + %0 = "mhlo.broadcast_in_dim"(%arg0) <{broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>}> : (tensor<1x1xi32>) -> tensor<1x2xi32> %1 = "mhlo.compare"(%0, %arg1) {comparison_direction = #mhlo} : (tensor<1x2xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> return %1 : tensor<1x2xi1> } @@ -814,7 +814,7 @@ func.func @greater(%arg0: tensor<2xi32>, %arg1: tensor<2xi32>) -> tensor<2xi1> { // CHECK: return %[[VAL_2]] : tensor<1x2xi1> // CHECK: } func.func @broadcast_greater(%arg0: tensor<1x1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi1> { - %0 = "mhlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<1x1xi32>) -> tensor<1x2xi32> + %0 = "mhlo.broadcast_in_dim"(%arg0) <{broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>}> : (tensor<1x1xi32>) -> tensor<1x2xi32> %1 = "mhlo.compare"(%0, %arg1) {comparison_direction = #mhlo} : (tensor<1x2xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> func.return %1 : tensor<1x2xi1> } @@ -855,7 +855,7 @@ func.func @greater_equal(%arg0: tensor<2xi32>, %arg1: tensor<2xi32>) -> tensor<2 // CHECK: return %[[VAL_2]] : tensor<1x2xi1> // CHECK: } func.func @broadcast_greater_equal(%arg0: tensor<1x1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi1> { - %0 = "mhlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<1x1xi32>) -> tensor<1x2xi32> + %0 = "mhlo.broadcast_in_dim"(%arg0) 
<{broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>}> : (tensor<1x1xi32>) -> tensor<1x2xi32> %1 = "mhlo.compare"(%0, %arg1) {comparison_direction = #mhlo} : (tensor<1x2xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> func.return %1 : tensor<1x2xi1> } @@ -889,7 +889,7 @@ func.func @less(%arg0: tensor<2xi32>, %arg1: tensor<2xi32>) -> tensor<2xi1> { // CHECK: return %[[VAL_2]] : tensor<1x2xi1> // CHECK: } func.func @broadcast_less(%arg0: tensor<1x1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi1> { - %0 = "mhlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<1x1xi32>) -> tensor<1x2xi32> + %0 = "mhlo.broadcast_in_dim"(%arg0) <{broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>}> : (tensor<1x1xi32>) -> tensor<1x2xi32> %1 = "mhlo.compare"(%0, %arg1) {comparison_direction = #mhlo} : (tensor<1x2xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> func.return %1 : tensor<1x2xi1> } @@ -923,7 +923,7 @@ func.func @less_equal(%arg0: tensor<2xi32>, %arg1: tensor<2xi32>) -> tensor<2xi1 // CHECK: return %[[VAL_2]] : tensor<1x2xi1> // CHECK: } func.func @broadcast_less_equal(%arg0: tensor<1x1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi1> { - %0 = "mhlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<1x1xi32>) -> tensor<1x2xi32> + %0 = "mhlo.broadcast_in_dim"(%arg0) <{broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>}> : (tensor<1x1xi32>) -> tensor<1x2xi32> %1 = "mhlo.compare"(%0, %arg1) {comparison_direction = #mhlo} : (tensor<1x2xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> func.return %1 : tensor<1x2xi1> } @@ -947,7 +947,7 @@ func.func @broadcast_less_equal_chlo(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32 // CHECK: return %[[VAL_3]] : tensor<6x3xf32> // CHECK: } func.func @concat_v2(%arg0: tensor<3x3xf32>, %arg1: tensor<3x3xf32>) -> tensor<6x3xf32> { - %2 = "mhlo.concatenate"(%arg0, %arg1) {dimension = 0 : i64} : (tensor<3x3xf32>, tensor<3x3xf32>) -> tensor<6x3xf32> + %2 = "mhlo.concatenate"(%arg0, %arg1) 
<{dimension = 0 : i64}> : (tensor<3x3xf32>, tensor<3x3xf32>) -> tensor<6x3xf32> func.return %2 : tensor<6x3xf32> } @@ -959,7 +959,7 @@ func.func @concat_v2(%arg0: tensor<3x3xf32>, %arg1: tensor<3x3xf32>) -> tensor<6 // CHECK: return %[[VAL_3]] : tensor<3x6xf32> // CHECK: } func.func @concat_v2_1d_axis(%arg0: tensor<3x3xf32>, %arg1: tensor<3x3xf32>) -> tensor<3x6xf32> { - %2 = "mhlo.concatenate"(%arg0, %arg1) {dimension = 1 : i64} : (tensor<3x3xf32>, tensor<3x3xf32>) -> tensor<3x6xf32> + %2 = "mhlo.concatenate"(%arg0, %arg1) <{dimension = 1 : i64}> : (tensor<3x3xf32>, tensor<3x3xf32>) -> tensor<3x6xf32> func.return %2 : tensor<3x6xf32> } @@ -1113,7 +1113,7 @@ func.func @selectv2_pred_scalar(%arg0: tensor, %arg1: tensor<2xi32>, %arg2: // CHECK: return %[[VAL_3]] : tensor<1x100xi32> // CHECK: } func.func @selectv2_broadcasted_operand(%arg0: tensor, %arg1: tensor<1x1xi32>, %arg2: tensor<1x100xi32>) -> tensor<1x100xi32> { - %0 = "mhlo.broadcast_in_dim"(%arg1) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<1x1xi32>) -> tensor<1x100xi32> + %0 = "mhlo.broadcast_in_dim"(%arg1) <{broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>}> : (tensor<1x1xi32>) -> tensor<1x100xi32> %1 = "mhlo.select"(%arg0, %0, %arg2) : (tensor, tensor<1x100xi32>, tensor<1x100xi32>) -> tensor<1x100xi32> func.return %1 : tensor<1x100xi32> } @@ -1126,7 +1126,7 @@ func.func @selectv2_broadcasted_operand(%arg0: tensor, %arg1: tensor<1x1xi32 // CHECK: return %[[VAL_3]] : tensor<1x100xi32> // CHECK: } func.func @selectv2_broadcasted_condition(%arg0: tensor<1x1xi1>, %arg1: tensor<1x100xi32>, %arg2: tensor<1x100xi32>) -> tensor<1x100xi32> { - %0 = "mhlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<1x1xi1>) -> tensor<1x100xi1> + %0 = "mhlo.broadcast_in_dim"(%arg0) <{broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>}> : (tensor<1x1xi1>) -> tensor<1x100xi1> %1 = "mhlo.select"(%0, %arg1, %arg2) : (tensor<1x100xi1>, tensor<1x100xi32>, 
tensor<1x100xi32>) -> tensor<1x100xi32> func.return %1 : tensor<1x100xi32> } @@ -1142,7 +1142,7 @@ func.func @selectv2_broadcasted_condition(%arg0: tensor<1x1xi1>, %arg1: tensor<1 func.func @transpose_2d(%arg0: tensor<2x3xf32>) -> tensor<3x2xf32> { %0 = mhlo.constant dense<[1, 0]> : tensor<2xi64> %1 = mhlo.constant dense<[1, 0]> : tensor<2xi64> - %2 = "mhlo.transpose"(%arg0) {permutation = dense<[1, 0]> : tensor<2xi64>} : (tensor<2x3xf32>) -> tensor<3x2xf32> + %2 = "mhlo.transpose"(%arg0) <{permutation = dense<[1, 0]> : tensor<2xi64>}> : (tensor<2x3xf32>) -> tensor<3x2xf32> func.return %2 : tensor<3x2xf32> } @@ -1157,7 +1157,7 @@ func.func @transpose_2d(%arg0: tensor<2x3xf32>) -> tensor<3x2xf32> { func.func @transpose_3d_int32(%arg0: tensor<1x2x3xf32>) -> tensor<3x2x1xf32> { %0 = mhlo.constant dense<[2, 1, 0]> : tensor<3xi32> %1 = mhlo.constant dense<[2, 1, 0]> : tensor<3xi64> - %2 = "mhlo.transpose"(%arg0) {permutation = dense<[2, 1, 0]> : tensor<3xi64>} : (tensor<1x2x3xf32>) -> tensor<3x2x1xf32> + %2 = "mhlo.transpose"(%arg0) <{permutation = dense<[2, 1, 0]> : tensor<3xi64>}> : (tensor<1x2x3xf32>) -> tensor<3x2x1xf32> func.return %2 : tensor<3x2x1xf32> } @@ -1172,7 +1172,7 @@ func.func @transpose_3d_int32(%arg0: tensor<1x2x3xf32>) -> tensor<3x2x1xf32> { func.func @transpose_3d(%arg0: tensor<1x2x3xf32>) -> tensor<3x2x1xf32> { %0 = mhlo.constant dense<[2, 1, 0]> : tensor<3xi64> %1 = mhlo.constant dense<[2, 1, 0]> : tensor<3xi64> - %2 = "mhlo.transpose"(%arg0) {permutation = dense<[2, 1, 0]> : tensor<3xi64>} : (tensor<1x2x3xf32>) -> tensor<3x2x1xf32> + %2 = "mhlo.transpose"(%arg0) <{permutation = dense<[2, 1, 0]> : tensor<3xi64>}> : (tensor<1x2x3xf32>) -> tensor<3x2x1xf32> func.return %2 : tensor<3x2x1xf32> } @@ -1187,7 +1187,7 @@ func.func @transpose_3d(%arg0: tensor<1x2x3xf32>) -> tensor<3x2x1xf32> { func.func @transpose_dynamic_2d(%arg0: tensor) -> tensor<4x?xf32> { %0 = mhlo.constant dense<[1, 0]> : tensor<2xi64> %1 = mhlo.constant dense<[1, 0]> : tensor<2xi64> 
- %2 = "mhlo.transpose"(%arg0) {permutation = dense<[1, 0]> : tensor<2xi64>} : (tensor) -> tensor<4x?xf32> + %2 = "mhlo.transpose"(%arg0) <{permutation = dense<[1, 0]> : tensor<2xi64>}> : (tensor) -> tensor<4x?xf32> func.return %2 : tensor<4x?xf32> } @@ -1588,7 +1588,7 @@ func.func @convert_i32_f32(%arg0: tensor<2xi32>) -> tensor<2xf32> { // CHECK: return %[[VAL_4]] : tensor<1x519xf32> // CHECK: } func.func @convert_slice(%arg0: tensor<1x4672xf32>) -> tensor<1x519xf32> { - %0 = "mhlo.slice"(%arg0) {limit_indices = dense<[1, 4672]> : tensor<2xi64>, start_indices = dense<[0, 4153]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x4672xf32>) -> tensor<1x519xf32> + %0 = "mhlo.slice"(%arg0) <{limit_indices = dense<[1, 4672]> : tensor<2xi64>, start_indices = dense<[0, 4153]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>}> : (tensor<1x4672xf32>) -> tensor<1x519xf32> func.return %0 : tensor<1x519xf32> } @@ -1676,7 +1676,7 @@ func.func @convert_dot_2d_2d(%arg0: tensor<1x256xf32>, %arg1: tensor<256x1xf32>) // CHECK: return %[[VAL_2]] : tensor<3x8x8x16xf32> // CHECK: } func.func @broadcast_in_dim_tf_style(%arg0: tensor<8x1x16xf32>) -> tensor<3x8x8x16xf32> { - %0 = "mhlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<[1, 2, 3]> : tensor<3xi64>, name = "broadcast.0"} : (tensor<8x1x16xf32>) -> tensor<3x8x8x16xf32> + %0 = "mhlo.broadcast_in_dim"(%arg0) <{broadcast_dimensions = dense<[1, 2, 3]> : tensor<3xi64>, name = "broadcast.0"}> : (tensor<8x1x16xf32>) -> tensor<3x8x8x16xf32> func.return %0 : tensor<3x8x8x16xf32> } @@ -1689,7 +1689,7 @@ func.func @broadcast_in_dim_tf_style(%arg0: tensor<8x1x16xf32>) -> tensor<3x8x8x // CHECK: return %[[VAL_4]] : tensor<3x8x8x16xf32> // CHECK: } func.func @broadcast_in_dim_general_case(%arg0: tensor<3x1x16xf32>) -> tensor<3x8x8x16xf32> { - %0 = "mhlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<[0, 2, 3]> : tensor<3xi64>, name = "broadcast.0"} : (tensor<3x1x16xf32>) -> tensor<3x8x8x16xf32> + %0 = 
"mhlo.broadcast_in_dim"(%arg0) <{broadcast_dimensions = dense<[0, 2, 3]> : tensor<3xi64>, name = "broadcast.0"}> : (tensor<3x1x16xf32>) -> tensor<3x8x8x16xf32> func.return %0 : tensor<3x8x8x16xf32> } @@ -1699,7 +1699,7 @@ func.func @broadcast_in_dim_general_case(%arg0: tensor<3x1x16xf32>) -> tensor<3x // CHECK %[[VAL_0:.*]] = "tf.BroadcastTo"(%[[ARG_0]], %[[ARG_1]]) : (tensor, tensor<5xi32>) -> tensor // CHECK return %[[VAL_0]] : tensor func.func @dynamic_broadcast_in_dim_tf_style(%arg0: tensor, %arg1: tensor<5xi32>) -> tensor { - %0 = "mhlo.dynamic_broadcast_in_dim"(%arg0, %arg1) {broadcast_dimensions = dense<[0, 1, 2, 3, 4]> : tensor<5xi64>} : (tensor, tensor<5xi32>) -> tensor + %0 = "mhlo.dynamic_broadcast_in_dim"(%arg0, %arg1) <{broadcast_dimensions = dense<[0, 1, 2, 3, 4]> : tensor<5xi64>}> : (tensor, tensor<5xi32>) -> tensor func.return %0 : tensor } @@ -1713,7 +1713,7 @@ func.func @dynamic_broadcast_in_dim_tf_style(%arg0: tensor, %arg1 // CHECK %[[VAL_2:.*]] = "tf.BroadcastTo"(%[[VAL_1]], %[[ARG_1]]) : (tensor, tensor<4xi32>) -> tensor // CHECK return %[[VAL_2]] : tensor func.func @dynamic_broadcast_in_dim_general_case_expand_back_dims(%arg0: tensor, %arg1: tensor<4xi32>) -> tensor { - %0 = "mhlo.dynamic_broadcast_in_dim"(%arg0, %arg1) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor, tensor<4xi32>) -> tensor + %0 = "mhlo.dynamic_broadcast_in_dim"(%arg0, %arg1) <{broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>}> : (tensor, tensor<4xi32>) -> tensor func.return %0 : tensor } @@ -1725,7 +1725,7 @@ func.func @dynamic_broadcast_in_dim_general_case_expand_back_dims(%arg0: tensor< // CHECK %[[VAL_1:.*]] = "tf.BroadcastTo"(%[[VAL_0]], %[[ARG_1]]) : (tensor, tensor<4xi32>) -> tensor // CHECK return %[[VAL_1]] : tensor func.func @dynamic_broadcast_in_dim_general_case_expand_middle_dim(%arg0: tensor, %arg1: tensor<4xi32>) -> tensor { - %0 = "mhlo.dynamic_broadcast_in_dim"(%arg0, %arg1) {broadcast_dimensions = dense<[0, 1, 3]> : tensor<3xi64>} : 
(tensor, tensor<4xi32>) -> tensor + %0 = "mhlo.dynamic_broadcast_in_dim"(%arg0, %arg1) <{broadcast_dimensions = dense<[0, 1, 3]> : tensor<3xi64>}> : (tensor, tensor<4xi32>) -> tensor func.return %0 : tensor } @@ -3041,7 +3041,7 @@ func.func @convert_reduce_to_min_int(%arg0: tensor<1x4xi32>) -> tensor<1xi32> { // CHECK: return %[[VAL_3]] : tensor<123xf32> // CHECK: } func.func @convert_iota_1d() -> tensor<123xf32> { - %0 = "mhlo.iota"() { iota_dimension = 0 : i64 } : () -> tensor<123xf32> + %0 = "mhlo.iota"() <{ iota_dimension = 0 : i64 }> : () -> tensor<123xf32> func.return %0 : tensor<123xf32> } @@ -3057,7 +3057,7 @@ func.func @convert_iota_1d() -> tensor<123xf32> { // CHECK: return %[[VAL_7]] : tensor<5x7x9xi32> // CHECK: } func.func @convert_iota_3d() -> tensor<5x7x9xi32> { - %0 = "mhlo.iota"() { iota_dimension = 1 : i64 } : () -> tensor<5x7x9xi32> + %0 = "mhlo.iota"() <{ iota_dimension = 1 : i64 }> : () -> tensor<5x7x9xi32> func.return %0 : tensor<5x7x9xi32> } @@ -3091,7 +3091,7 @@ func.func @convert_avgpool_valid(%arg0: tensor<4x16x16x8xf32>) -> tensor<4x7x7x8 func.func @convert_avgpool_valid_broadcasted_divisor(%arg0: tensor<4x16x16x8xf32>) -> tensor<4x7x7x8xf32> { %0 = mhlo.constant dense<0.0> : tensor %1 = mhlo.constant dense<9.0> : tensor - %2 = "mhlo.broadcast_in_dim"(%1) {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor) -> tensor<4x7x7x8xf32> + %2 = "mhlo.broadcast_in_dim"(%1) <{broadcast_dimensions = dense<> : tensor<0xi64>}> : (tensor) -> tensor<4x7x7x8xf32> %3 = "mhlo.reduce_window"(%arg0, %0) ({ ^bb0(%arg1: tensor, %arg2: tensor): %5 = mhlo.add %arg1, %arg2 : tensor @@ -3167,7 +3167,7 @@ func.func @convert_avgpool_valid_rw(%arg0: tensor<4x16x16x8xf32>) -> tensor<4x7x // CHECK: } func.func @convert_avgpool_valid_rw_broadcasted_const_lhs(%arg0: tensor<4x16x16x8xf32>) -> tensor<4x7x7x8xf32> { %0 = mhlo.constant dense<1.0> : tensor - %1 = "mhlo.broadcast_in_dim"(%0) {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor) -> 
tensor<4x16x16x8xf32> + %1 = "mhlo.broadcast_in_dim"(%0) <{broadcast_dimensions = dense<> : tensor<0xi64>}> : (tensor) -> tensor<4x16x16x8xf32> %2 = mhlo.constant dense<0.0> : tensor %3 = "mhlo.reduce_window"(%arg0, %2) ({ ^bb0(%arg1: tensor, %arg2: tensor): @@ -3288,7 +3288,7 @@ func.func @convert_avgpool_reshape_broadcast(%arg0: tensor<4x16x16x8xf32>) -> te mhlo.return %7 : tensor }) {base_dilations = dense<1> : tensor<4xi64>, padding = dense<[[0, 0], [0, 1], [0, 1], [0, 0]]> : tensor<4x2xi64>, window_dilations = dense<1> : tensor<4xi64>, window_dimensions = dense<[1, 3, 3, 1]> : tensor<4xi64>, window_strides = dense<[1, 2, 2, 1]> : tensor<4xi64>} : (tensor<1x16x16x1xf32>, tensor) -> tensor<1x8x8x1xf32> %4 = mhlo.reshape %3 : (tensor<1x8x8x1xf32>) -> tensor<8x8xf32> - %5 = "mhlo.broadcast_in_dim"(%4) {broadcast_dimensions = dense<[1, 2]> : tensor<2xi64>} : (tensor<8x8xf32>) -> tensor<4x8x8x8xf32> + %5 = "mhlo.broadcast_in_dim"(%4) <{broadcast_dimensions = dense<[1, 2]> : tensor<2xi64>}> : (tensor<8x8xf32>) -> tensor<4x8x8x8xf32> %6 = mhlo.divide %2, %5 : tensor<4x8x8x8xf32> return %6 : tensor<4x8x8x8xf32> } @@ -3626,7 +3626,7 @@ func.func @convert_floor_div_broadcast_cst(%arg0: tensor<10x8xf32>) -> tensor<10 %1 = mhlo.constant dense<[1.000000e+00, 2.000000e+00, 4.000000e+00, 8.000000e+00, 1.600000e+01, 3.200000e+01, 6.400000e+01, 1.280000e+02]> : tensor<8xf32> %2 = mhlo.constant dense<0.000000e+00> : tensor<10x8xf32> %3 = mhlo.constant dense<-1.000000e+00> : tensor<10x8xf32> - %5 = "mhlo.broadcast_in_dim"(%1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<8xf32>) -> tensor<10x8xf32> + %5 = "mhlo.broadcast_in_dim"(%1) <{broadcast_dimensions = dense<1> : tensor<1xi64>}> : (tensor<8xf32>) -> tensor<10x8xf32> %6 = mhlo.remainder %arg0, %5 : tensor<10x8xf32> %7 = "mhlo.compare"(%6, %2) {comparison_direction = #mhlo} : (tensor<10x8xf32>, tensor<10x8xf32>) -> tensor<10x8xi1> %8 = "mhlo.sign"(%6) : (tensor<10x8xf32>) -> tensor<10x8xf32> @@ -3890,7 +3890,7 @@ 
func.func @convert_gather_to_slice_dynamic_error(%arg0: tensor<3x?xi32>, %arg1: // CHECK: return %[[VAL_14]] : tensor<4x2xf32> // CHECK: } func.func @convert_dynamic_slice(%arg0: tensor<7x3xf32>, %arg1: tensor, %arg2: tensor) -> tensor<4x2xf32> { - %0 = "mhlo.dynamic_slice"(%arg0, %arg1, %arg2) {slice_sizes = dense<[4, 2]> : tensor<2xi64>} : (tensor<7x3xf32>, tensor, tensor) -> tensor<4x2xf32> + %0 = "mhlo.dynamic_slice"(%arg0, %arg1, %arg2) <{slice_sizes = dense<[4, 2]> : tensor<2xi64>}> : (tensor<7x3xf32>, tensor, tensor) -> tensor<4x2xf32> func.return %0 : tensor<4x2xf32> } @@ -3913,7 +3913,7 @@ func.func @convert_dynamic_slice(%arg0: tensor<7x3xf32>, %arg1: tensor, %ar // CHECK: return %[[VAL_14]] : tensor<4x2xf32> // CHECK: } func.func @convert_dynamic_slice_ui32(%arg0: tensor<7x3xf32>, %arg1: tensor, %arg2: tensor) -> tensor<4x2xf32> { - %0 = "mhlo.dynamic_slice"(%arg0, %arg1, %arg2) {slice_sizes = dense<[4, 2]> : tensor<2xi64>} : (tensor<7x3xf32>, tensor, tensor) -> tensor<4x2xf32> + %0 = "mhlo.dynamic_slice"(%arg0, %arg1, %arg2) <{slice_sizes = dense<[4, 2]> : tensor<2xi64>}> : (tensor<7x3xf32>, tensor, tensor) -> tensor<4x2xf32> func.return %0 : tensor<4x2xf32> } @@ -3921,10 +3921,12 @@ func.func @convert_dynamic_slice_ui32(%arg0: tensor<7x3xf32>, %arg1: tensor, // CHECK-SAME: %[[VAL_1:.*]]: tensor<4xi32>, // CHECK-SAME: %[[VAL_2:.*]]: tensor<4x6xf32>) -> tensor<20x6xf32> { -// CHECK: %[[VAL_3:.*]] = "mhlo.scatter"(%[[VAL_0]], %[[VAL_1]], %[[VAL_2]]) ({ +// CHECK: %[[VAL_3:.*]] = "mhlo.scatter"(%[[VAL_0]], %[[VAL_1]], %[[VAL_2]]) <{ +// CHECK-SAME: indices_are_sorted = false, scatter_dimension_numbers = #mhlo.scatter, unique_indices = false +// CHECK-SAME: }> ({ // CHECK: ^bb0(%[[VAL_4:.*]]: tensor, %[[VAL_5:.*]]: tensor): // CHECK: mhlo.return %[[VAL_5]] : tensor -// CHECK: }) {indices_are_sorted = false, scatter_dimension_numbers = #mhlo.scatter, unique_indices = false} : (tensor<20x6xf32>, tensor<4xi32>, tensor<4x6xf32>) -> tensor<20x6xf32> +// CHECK: 
}) : (tensor<20x6xf32>, tensor<4xi32>, tensor<4x6xf32>) -> tensor<20x6xf32> // CHECK: return %[[VAL_3]] : tensor<20x6xf32> // CHECK: } func.func @convert_scatter_update(%arg0: tensor<20x6xf32>, %arg1: tensor<4xi32>, %arg2: tensor<4x6xf32>) -> tensor<20x6xf32> { @@ -3947,10 +3949,12 @@ func.func @convert_scatter_update(%arg0: tensor<20x6xf32>, %arg1: tensor<4xi32>, // CHECK-SAME: %[[VAL_0:.*]]: tensor<5x10xf32>, // CHECK-SAME: %[[VAL_1:.*]]: tensor<3x1xi32>, // CHECK-SAME: %[[VAL_2:.*]]: tensor<10x3xf32>) -> tensor<5x10xf32> { -// CHECK: %[[VAL_3:.*]] = "mhlo.scatter"(%[[VAL_0]], %[[VAL_1]], %[[VAL_2]]) ({ +// CHECK: %[[VAL_3:.*]] = "mhlo.scatter"(%[[VAL_0]], %[[VAL_1]], %[[VAL_2]]) <{ +// CHECK-SAME: indices_are_sorted = false, scatter_dimension_numbers = #mhlo.scatter, unique_indices = false +// CHECK-SAME: }> ({ // CHECK: ^bb0(%[[VAL_4:.*]]: tensor, %[[VAL_5:.*]]: tensor): // CHECK: mhlo.return %[[VAL_5]] : tensor -// CHECK: }) {indices_are_sorted = false, scatter_dimension_numbers = #mhlo.scatter, unique_indices = false} : (tensor<5x10xf32>, tensor<3x1xi32>, tensor<10x3xf32>) -> tensor<5x10xf32> +// CHECK: }) : (tensor<5x10xf32>, tensor<3x1xi32>, tensor<10x3xf32>) -> tensor<5x10xf32> // CHECK: return %[[VAL_3]] : tensor<5x10xf32> // CHECK: } func.func @convert_scatter_update_with_non_trailing_update_window_dims( @@ -3977,10 +3981,12 @@ func.func @convert_scatter_update_with_non_trailing_update_window_dims( // CHECK-SAME: %[[VAL_0:.*]]: tensor<5x4x3x7xf32>, // CHECK-SAME: %[[VAL_1:.*]]: tensor<2x2xi32>, // CHECK-SAME: %[[VAL_2:.*]]: tensor<2x5x3xf32>) -> tensor<5x4x3x7xf32> { -// CHECK: %[[VAL_3:.*]] = "mhlo.scatter"(%[[VAL_0]], %[[VAL_1]], %[[VAL_2]]) ({ +// CHECK: %[[VAL_3:.*]] = "mhlo.scatter"(%[[VAL_0]], %[[VAL_1]], %[[VAL_2]]) <{ +// CHECK-SAME: indices_are_sorted = false, scatter_dimension_numbers = #mhlo.scatter, unique_indices = false +// CHECK-SAME: }> ({ // CHECK: ^bb0(%[[VAL_4:.*]]: tensor, %[[VAL_5:.*]]: tensor): // CHECK: mhlo.return %[[VAL_5]] : 
tensor -// CHECK: }) {indices_are_sorted = false, scatter_dimension_numbers = #mhlo.scatter, unique_indices = false} : (tensor<5x4x3x7xf32>, tensor<2x2xi32>, tensor<2x5x3xf32>) -> tensor<5x4x3x7xf32> +// CHECK: }) : (tensor<5x4x3x7xf32>, tensor<2x2xi32>, tensor<2x5x3xf32>) -> tensor<5x4x3x7xf32> // CHECK: return %[[VAL_3]] : tensor<5x4x3x7xf32> // CHECK: } func.func @convert_scatter_update_to_non_trailing_operand_dimensions( @@ -4006,10 +4012,12 @@ func.func @convert_scatter_update_to_non_trailing_operand_dimensions( // CHECK-SAME: %[[VAL_0:.*]]: tensor<16x1504xf32>, // CHECK-SAME: %[[VAL_1:.*]]: tensor<1xi32>, // CHECK-SAME: %[[VAL_2:.*]]: tensor<16xf32>) -> tensor<16x1504xf32> { -// CHECK: %[[VAL_3:.*]] = "mhlo.scatter"(%[[VAL_0]], %[[VAL_1]], %[[VAL_2]]) ({ +// CHECK: %[[VAL_3:.*]] = "mhlo.scatter"(%[[VAL_0]], %[[VAL_1]], %[[VAL_2]]) <{ +// CHECK-SAME: indices_are_sorted = true, scatter_dimension_numbers = #mhlo.scatter, unique_indices = true +// CHECK-SAME: }> ({ // CHECK: ^bb0(%[[VAL_4:.*]]: tensor, %[[VAL_5:.*]]: tensor): // CHECK: mhlo.return %[[VAL_5]] : tensor -// CHECK: }) {indices_are_sorted = true, scatter_dimension_numbers = #mhlo.scatter, unique_indices = true} : (tensor<16x1504xf32>, tensor<1xi32>, tensor<16xf32>) -> tensor<16x1504xf32> +// CHECK: }) : (tensor<16x1504xf32>, tensor<1xi32>, tensor<16xf32>) -> tensor<16x1504xf32> // CHECK: return %[[VAL_3]] : tensor<16x1504xf32> // CHECK: } func.func @convert_scatter_update_reshape_indices_and_updates( @@ -4034,11 +4042,13 @@ func.func @convert_scatter_update_reshape_indices_and_updates( // CHECK-SAME: %[[VAL_0:.*]]: tensor<20x6xf32>, // CHECK-SAME: %[[VAL_1:.*]]: tensor<4x1xi32>, // CHECK-SAME: %[[VAL_2:.*]]: tensor<4x6xf32>) -> tensor<20x6xf32> { -// CHECK: %[[VAL_6:.*]] = "mhlo.scatter"(%[[VAL_0]], %[[VAL_1]], %[[VAL_2]]) ({ +// CHECK: %[[VAL_6:.*]] = "mhlo.scatter"(%[[VAL_0]], %[[VAL_1]], %[[VAL_2]]) <{ +// CHECK-SAME: indices_are_sorted = false, scatter_dimension_numbers = #mhlo.scatter, 
unique_indices = false +// CHECK-SAME: }> ({ // CHECK: ^bb0(%[[VAL_3:.*]]: tensor, %[[VAL_4:.*]]: tensor): // CHECK: %[[VAL_5:.*]] = mhlo.add %[[VAL_3]], %[[VAL_4]] : tensor // CHECK: mhlo.return %[[VAL_5]] : tensor -// CHECK: }) {indices_are_sorted = false, scatter_dimension_numbers = #mhlo.scatter, unique_indices = false} : (tensor<20x6xf32>, tensor<4x1xi32>, tensor<4x6xf32>) -> tensor<20x6xf32> +// CHECK: }) : (tensor<20x6xf32>, tensor<4x1xi32>, tensor<4x6xf32>) -> tensor<20x6xf32> // CHECK: return %[[VAL_6]] : tensor<20x6xf32> // CHECK: } func.func @convert_scatter_add(%arg0: tensor<20x6xf32>, %arg1: tensor<4x1xi32>, %arg2: tensor<4x6xf32>) -> tensor<20x6xf32> { @@ -4062,11 +4072,13 @@ func.func @convert_scatter_add(%arg0: tensor<20x6xf32>, %arg1: tensor<4x1xi32>, // CHECK-SAME: %[[VAL_0:.*]]: tensor<20x6xf32>, // CHECK-SAME: %[[VAL_1:.*]]: tensor<4x1xi32>, // CHECK-SAME: %[[VAL_2:.*]]: tensor<4x6xf32>) -> tensor<20x6xf32> { -// CHECK: %[[VAL_6:.*]] = "mhlo.scatter"(%[[VAL_0]], %[[VAL_1]], %[[VAL_2]]) ({ +// CHECK: %[[VAL_6:.*]] = "mhlo.scatter"(%[[VAL_0]], %[[VAL_1]], %[[VAL_2]]) <{ +// CHECK-SAME: indices_are_sorted = false, scatter_dimension_numbers = #mhlo.scatter, unique_indices = false +// CHECK-SAME: }> ({ // CHECK: ^bb0(%[[VAL_3:.*]]: tensor, %[[VAL_4:.*]]: tensor): // CHECK: %[[VAL_5:.*]] = mhlo.maximum %[[VAL_3]], %[[VAL_4]] : tensor // CHECK: mhlo.return %[[VAL_5]] : tensor -// CHECK: }) {indices_are_sorted = false, scatter_dimension_numbers = #mhlo.scatter, unique_indices = false} : (tensor<20x6xf32>, tensor<4x1xi32>, tensor<4x6xf32>) -> tensor<20x6xf32> +// CHECK: }) : (tensor<20x6xf32>, tensor<4x1xi32>, tensor<4x6xf32>) -> tensor<20x6xf32> // CHECK: return %[[VAL_6]] : tensor<20x6xf32> // CHECK: } func.func @convert_scatter_max(%arg0: tensor<20x6xf32>, %arg1: tensor<4x1xi32>, %arg2: tensor<4x6xf32>) -> tensor<20x6xf32> { @@ -4090,11 +4102,13 @@ func.func @convert_scatter_max(%arg0: tensor<20x6xf32>, %arg1: tensor<4x1xi32>, // CHECK-SAME: 
%[[VAL_0:.*]]: tensor<20x6xf32>, // CHECK-SAME: %[[VAL_1:.*]]: tensor<4x1xi32>, // CHECK-SAME: %[[VAL_2:.*]]: tensor<4x6xf32>) -> tensor<20x6xf32> { -// CHECK: %[[VAL_6:.*]] = "mhlo.scatter"(%[[VAL_0]], %[[VAL_1]], %[[VAL_2]]) ({ +// CHECK: %[[VAL_6:.*]] = "mhlo.scatter"(%[[VAL_0]], %[[VAL_1]], %[[VAL_2]]) <{ +// CHECK-SAME: indices_are_sorted = false, scatter_dimension_numbers = #mhlo.scatter, unique_indices = false +// CHECK-SAME: }> ({ // CHECK: ^bb0(%[[VAL_3:.*]]: tensor, %[[VAL_4:.*]]: tensor): // CHECK: %[[VAL_5:.*]] = mhlo.minimum %[[VAL_3]], %[[VAL_4]] : tensor // CHECK: mhlo.return %[[VAL_5]] : tensor -// CHECK: }) {indices_are_sorted = false, scatter_dimension_numbers = #mhlo.scatter, unique_indices = false} : (tensor<20x6xf32>, tensor<4x1xi32>, tensor<4x6xf32>) -> tensor<20x6xf32> +// CHECK: }) : (tensor<20x6xf32>, tensor<4x1xi32>, tensor<4x6xf32>) -> tensor<20x6xf32> // CHECK: return %[[VAL_6]] : tensor<20x6xf32> // CHECK: } func.func @convert_scatter_min(%arg0: tensor<20x6xf32>, %arg1: tensor<4x1xi32>, %arg2: tensor<4x6xf32>) -> tensor<20x6xf32> { @@ -4118,11 +4132,13 @@ func.func @convert_scatter_min(%arg0: tensor<20x6xf32>, %arg1: tensor<4x1xi32>, // CHECK-SAME: %[[VAL_0:.*]]: tensor<20x6xf32>, // CHECK-SAME: %[[VAL_1:.*]]: tensor<4x1xi32>, // CHECK-SAME: %[[VAL_2:.*]]: tensor<4x6xf32>) -> tensor<20x6xf32> { -// CHECK: %[[VAL_6:.*]] = "mhlo.scatter"(%[[VAL_0]], %[[VAL_1]], %[[VAL_2]]) ({ +// CHECK: %[[VAL_6:.*]] = "mhlo.scatter"(%[[VAL_0]], %[[VAL_1]], %[[VAL_2]]) <{ +// CHECK-SAME: indices_are_sorted = false, scatter_dimension_numbers = #mhlo.scatter, unique_indices = false +// CHECK-SAME: }> ({ // CHECK: ^bb0(%[[VAL_3:.*]]: tensor, %[[VAL_4:.*]]: tensor): // CHECK: %[[VAL_5:.*]] = mhlo.subtract %[[VAL_3]], %[[VAL_4]] : tensor // CHECK: mhlo.return %[[VAL_5]] : tensor -// CHECK: }) {indices_are_sorted = false, scatter_dimension_numbers = #mhlo.scatter, unique_indices = false} : (tensor<20x6xf32>, tensor<4x1xi32>, tensor<4x6xf32>) -> tensor<20x6xf32> 
+// CHECK: }) : (tensor<20x6xf32>, tensor<4x1xi32>, tensor<4x6xf32>) -> tensor<20x6xf32> // CHECK: return %[[VAL_6]] : tensor<20x6xf32> // CHECK: } func.func @convert_scatter_sub(%arg0: tensor<20x6xf32>, %arg1: tensor<4x1xi32>, %arg2: tensor<4x6xf32>) -> tensor<20x6xf32> { @@ -4146,7 +4162,7 @@ func.func @convert_scatter_sub(%arg0: tensor<20x6xf32>, %arg1: tensor<4x1xi32>, func.func @convert_pytorch_argmax(%arg0: tensor<1x9xi32>) -> tensor<1xi32> { %0 = mhlo.constant dense<0> : tensor %1 = mhlo.constant dense<-2147483648> : tensor - %2 = "mhlo.iota"() {iota_dimension = 0 : i64} : () -> tensor<9xi32> + %2 = "mhlo.iota"() <{iota_dimension = 0 : i64}> : () -> tensor<9xi32> %3 = mhlo.reshape %2 : (tensor<9xi32>) -> tensor<1x9xi32> %4:2 = mhlo.reduce(%arg0 init: %1), (%3 init: %0) across dimensions = [1] : (tensor<1x9xi32>, tensor<1x9xi32>, tensor, tensor) -> (tensor<1xi32>, tensor<1xi32>) reducer(%arg1: tensor, %arg3: tensor) (%arg2: tensor, %arg4: tensor) { @@ -4184,8 +4200,8 @@ func.func @convert_pytorch_argmax(%arg0: tensor<1x9xi32>) -> tensor<1xi32> { func.func @convert_argmax(%arg0: tensor<4x32x256xf32>) -> (tensor<4x32xf32>, tensor<4x32xi32>) { %0 = mhlo.constant dense<0xFF800000> : tensor %1 = mhlo.constant dense<0> : tensor - %2 = "mhlo.iota"() {iota_dimension = 0 : i64} : () -> tensor<256xi32> - %3 = "mhlo.broadcast_in_dim"(%2) {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<256xi32>) -> tensor<4x32x256xi32> + %2 = "mhlo.iota"() <{iota_dimension = 0 : i64}> : () -> tensor<256xi32> + %3 = "mhlo.broadcast_in_dim"(%2) <{broadcast_dimensions = dense<2> : tensor<1xi64>}> : (tensor<256xi32>) -> tensor<4x32x256xi32> %4:2 = "mhlo.reduce"(%arg0, %3, %0, %1) ({ ^bb0(%arg1: tensor, %arg2: tensor, %arg3: tensor, %arg4: tensor): %7 = "mhlo.compare"(%arg1, %arg3) {comparison_direction = #mhlo} : (tensor, tensor) -> tensor @@ -4277,7 +4293,7 @@ func.func @convert_argmax_constant_non_z_axis(%arg0: tensor<4x4xf32>) -> (tensor // CHECK: } func.func 
@convert_argmax_bool(%arg0: tensor<2xi1>) -> tensor { - %0 = "mhlo.iota"() {iota_dimension = 0 : i64} : () -> tensor<2xi32> + %0 = "mhlo.iota"() <{iota_dimension = 0 : i64}> : () -> tensor<2xi32> %1 = mhlo.constant dense : tensor %2 = mhlo.constant dense<0> : tensor %3:2 = mhlo.reduce(%arg0 init: %1), (%0 init: %2) across dimensions = [0] : (tensor<2xi1>, tensor<2xi32>, tensor, tensor) -> (tensor, tensor) @@ -4304,8 +4320,8 @@ func.func @convert_argmax_bool(%arg0: tensor<2xi1>) -> tensor { func.func @convert_argmin(%arg0: tensor<4x32x256xf32>) -> (tensor<4x32xf32>, tensor<4x32xi32>) { %0 = mhlo.constant dense<0x7F800000> : tensor %1 = mhlo.constant dense<0> : tensor - %2 = "mhlo.iota"() {iota_dimension = 0 : i64} : () -> tensor<256xi32> - %3 = "mhlo.broadcast_in_dim"(%2) {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<256xi32>) -> tensor<4x32x256xi32> + %2 = "mhlo.iota"() <{iota_dimension = 0 : i64}> : () -> tensor<256xi32> + %3 = "mhlo.broadcast_in_dim"(%2) <{broadcast_dimensions = dense<2> : tensor<1xi64>}> : (tensor<256xi32>) -> tensor<4x32x256xi32> %4:2 = "mhlo.reduce"(%arg0, %3, %0, %1) ({ ^bb0(%arg1: tensor, %arg2: tensor, %arg3: tensor, %arg4: tensor): %7 = "mhlo.compare"(%arg1, %arg3) {comparison_direction = #mhlo} : (tensor, tensor) -> tensor @@ -4331,7 +4347,7 @@ func.func @convert_argmin(%arg0: tensor<4x32x256xf32>) -> (tensor<4x32xf32>, ten // CHECK: } func.func @convert_argmin_i16(%arg0: tensor<2xi16>) -> (tensor, tensor) { %0 = mhlo.constant dense : tensor - %1 = "mhlo.iota"() {iota_dimension = 0 : i64} : () -> tensor<2xi32> + %1 = "mhlo.iota"() <{iota_dimension = 0 : i64}> : () -> tensor<2xi32> %2 = mhlo.constant dense<32767> : tensor %3 = mhlo.constant dense<0> : tensor %4:2 = "mhlo.reduce"(%arg0, %1, %2, %3) ({ @@ -4394,7 +4410,7 @@ func.func @convert_argmin_constant(%arg0: tensor<2x2x4xf32>) -> (tensor<2x2xf32> // CHECK: return %[[VAL_9]] : tensor // CHECK: } func.func @convert_argmin_bool(%arg0: tensor<2xi1>) -> tensor { - %0 = 
"mhlo.iota"() {iota_dimension = 0 : i64} : () -> tensor<2xi32> + %0 = "mhlo.iota"() <{iota_dimension = 0 : i64}> : () -> tensor<2xi32> %1 = mhlo.constant dense : tensor %2 = mhlo.constant dense<0> : tensor %3:2 = mhlo.reduce(%arg0 init: %1), (%0 init: %2) across dimensions = [0] : (tensor<2xi1>, tensor<2xi32>, tensor, tensor) -> (tensor, tensor) @@ -4429,7 +4445,7 @@ func.func @convert_argmin_bool(%arg0: tensor<2xi1>) -> tensor { func.func @convert_argmax_with_reshaped_iota(%arg0: tensor<1x32x1xf32>) -> (tensor<1x1xf32>, tensor<1x1xi32>) { %0 = mhlo.constant dense<0xFF800000> : tensor %1 = mhlo.constant dense<0> : tensor - %2 = "mhlo.iota"() {iota_dimension = 0 : i64} : () -> tensor<32xi32> + %2 = "mhlo.iota"() <{iota_dimension = 0 : i64}> : () -> tensor<32xi32> %3 = "mhlo.reshape"(%2) : (tensor<32xi32>) -> tensor<1x32x1xi32> %4:2 = mhlo.reduce(%arg0 init: %0), (%3 init: %1) across dimensions = [1] : (tensor<1x32x1xf32>, tensor<1x32x1xi32>, tensor, tensor) -> (tensor<1x1xf32>, tensor<1x1xi32>) reducer(%arg1: tensor, %arg3: tensor) (%arg2: tensor, %arg4: tensor) { @@ -4772,8 +4788,8 @@ func.func @convert_reduce_to_any_non_constant_init(%arg0: tensor, %arg1: ten // CHECK: return %[[VALUES]], %[[INDICES]] : tensor<3x6xf32>, tensor<3x6xi32> // CHECK: } func.func @convert_sort_to_topk_iota_broadcast(%arg0: tensor<3x6xf32>) -> (tensor<3x6xf32>, tensor<3x6xi32>) { - %0 = "mhlo.iota"() { iota_dimension = 0 : i64 } : () -> tensor<6xi32> - %1 = "mhlo.broadcast_in_dim"(%0) {broadcast_dimensions = dense<[1]> : tensor<1xi64>, name = "broadcast.0"} : (tensor<6xi32>) -> tensor<3x6xi32> + %0 = "mhlo.iota"() <{ iota_dimension = 0 : i64 }> : () -> tensor<6xi32> + %1 = "mhlo.broadcast_in_dim"(%0) <{broadcast_dimensions = dense<[1]> : tensor<1xi64>, name = "broadcast.0"}> : (tensor<6xi32>) -> tensor<3x6xi32> %2:2 = "mhlo.sort"(%arg0, %1) ({ ^bb0(%arg1: tensor, %arg2: tensor, %arg3: tensor, %arg4: tensor): %3 = "mhlo.compare"(%arg1, %arg2) {compare_type = #mhlo, comparison_direction = 
#mhlo} : (tensor, tensor) -> tensor @@ -4793,7 +4809,7 @@ func.func @convert_sort_to_topk_iota_broadcast(%arg0: tensor<3x6xf32>) -> (tenso // CHECK: } func.func @convert_sort_to_topk_iotacst_broadcast(%arg0: tensor<3x6xf32>) -> (tensor<3x6xf32>, tensor<3x6xi32>) { %0 = mhlo.constant dense<[0, 1, 2, 3, 4, 5]> : tensor<6xi32> - %1 = "mhlo.broadcast_in_dim"(%0) {broadcast_dimensions = dense<[1]> : tensor<1xi64>, name = "broadcast.0"} : (tensor<6xi32>) -> tensor<3x6xi32> + %1 = "mhlo.broadcast_in_dim"(%0) <{broadcast_dimensions = dense<[1]> : tensor<1xi64>, name = "broadcast.0"}> : (tensor<6xi32>) -> tensor<3x6xi32> %2:2 = "mhlo.sort"(%arg0, %1) ({ ^bb0(%arg1: tensor, %arg2: tensor, %arg3: tensor, %arg4: tensor): %3 = "mhlo.compare"(%arg1, %arg2) {compare_type = #mhlo, comparison_direction = #mhlo} : (tensor, tensor) -> tensor @@ -4996,7 +5012,7 @@ func.func @convert_dot_quant_type(%arg0: tensor<1x256xf32>, %arg1: tensor<256x!q // CHECK %[[CST_0:.*]] = "tf.Const"() <{value = dense<256> : tensor}> : () -> tensor // CHECK return %[[CST_0]] : tensor func.func @get_dimension_size(%arg0: tensor<4x256x?xf32>) -> tensor { - %0 = "mhlo.get_dimension_size"(%arg0) {dimension = 1 : i64} : (tensor<4x256x?xf32>) -> tensor + %0 = "mhlo.get_dimension_size"(%arg0) <{dimension = 1 : i64}> : (tensor<4x256x?xf32>) -> tensor func.return %0 : tensor } @@ -5009,7 +5025,7 @@ func.func @get_dimension_size(%arg0: tensor<4x256x?xf32>) -> tensor { // CHECK %[[VAL_2:.*]] = "tf.Squeeze"(%[[VAL_1]]) <{squeeze_dims = [0]}> : (tensor<1xi32>) -> tensor // CHECK return %[[VAL_2]] : tensor func.func @get_dimension_size_dynamic(%arg0: tensor<4x256x?xf32>) -> tensor { - %0 = "mhlo.get_dimension_size"(%arg0) {dimension = 2 : i64} : (tensor<4x256x?xf32>) -> tensor + %0 = "mhlo.get_dimension_size"(%arg0) <{dimension = 2 : i64}> : (tensor<4x256x?xf32>) -> tensor func.return %0 : tensor } @@ -5022,7 +5038,7 @@ func.func @get_dimension_size_dynamic(%arg0: tensor<4x256x?xf32>) -> tensor // CHECK: %[[VAL_1:.*]] = 
"tf.Range"(%[[CST_1]], %[[VAL_0]], %[[CST_2]]) : (tensor, tensor, tensor) -> tensor // CHECK: return %[[VAL_1]] : tensor func.func @dynamic_iota_i32_1d(%arg0: tensor<1xi32>) -> tensor { - %0 = "mhlo.dynamic_iota"(%arg0) {iota_dimension = 0 : i64} : (tensor<1xi32>) -> tensor + %0 = "mhlo.dynamic_iota"(%arg0) <{iota_dimension = 0 : i64}> : (tensor<1xi32>) -> tensor func.return %0 : tensor } @@ -5036,7 +5052,7 @@ func.func @dynamic_iota_i32_1d(%arg0: tensor<1xi32>) -> tensor { // CHECK: %[[VAL_2:.*]] = "tf.Range"(%[[CST_1]], %[[VAL_1]], %[[CST_2]]) : (tensor, tensor, tensor) -> tensor // CHECK: return %[[VAL_2]] : tensor func.func @dynamic_iota_f32_1d(%arg0: tensor<1xi32>) -> tensor { - %0 = "mhlo.dynamic_iota"(%arg0) {iota_dimension = 0 : i64} : (tensor<1xi32>) -> tensor + %0 = "mhlo.dynamic_iota"(%arg0) <{iota_dimension = 0 : i64}> : (tensor<1xi32>) -> tensor func.return %0 : tensor } @@ -5070,7 +5086,7 @@ func.return %0 : tensor<1x?x1x2xf32> // CHECK-NOT: "mhlo.custom_call" func.func @remove_shape_assertion_custom_call(%arg1: tensor) -> tensor { %0 = mhlo.constant dense<3> : tensor - %1 = "mhlo.get_dimension_size"(%arg1) {dimension = 0 : i64} : (tensor) -> tensor + %1 = "mhlo.get_dimension_size"(%arg1) <{dimension = 0 : i64}> : (tensor) -> tensor %ok = mhlo.compare EQ, %1, %0, SIGNED : (tensor, tensor) -> tensor mhlo.custom_call @shape_assertion(%ok) { error_message = "The error message", diff --git a/tensorflow/compiler/mlir/lite/stablehlo/tests/odml-to-stablehlo-allow-tf.mlir b/tensorflow/compiler/mlir/lite/stablehlo/tests/odml-to-stablehlo-allow-tf.mlir index 608b90d54a7f72..4596d21637b69e 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/tests/odml-to-stablehlo-allow-tf.mlir +++ b/tensorflow/compiler/mlir/lite/stablehlo/tests/odml-to-stablehlo-allow-tf.mlir @@ -3,8 +3,8 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, producer = 975 : i32}, tf_saved_model.semantics} { func.func @serving_default(%arg0: tensor<1x20x20x28xf32> 
{tf_saved_model.index_path = ["a"]}) -> (tensor<1x40x40x28xf32> {tf_saved_model.index_path = ["b"]}) attributes {tf.entry_function = {control_outputs = "", inputs = "c:0", outputs = "d:0"}, tf_saved_model.exported_names = ["serving_default"]} { - %0 = stablehlo.constant dense<40> : tensor<2xi32> - %1 = "tf.UnconvertedOp"(%arg0, %0) {align_corners = false, half_pixel_centers = false} : (tensor<1x20x20x28xf32>, tensor<2xi32>) -> tensor<1x40x40x28xf32> - func.return %1 : tensor<1x40x40x28xf32> + %c = stablehlo.constant dense<40> : tensor<2xi32> + %0 = "tf.UnconvertedOp"(%arg0, %c) {align_corners = false, half_pixel_centers = false} : (tensor<1x20x20x28xf32>, tensor<2xi32>) -> tensor<1x40x40x28xf32> + func.return %0 : tensor<1x40x40x28xf32> } } diff --git a/tensorflow/compiler/mlir/lite/stablehlo/tests/odml-to-stablehlo-smuggle-resize.mlir b/tensorflow/compiler/mlir/lite/stablehlo/tests/odml-to-stablehlo-smuggle-resize.mlir index 4a0f6a5d5e673b..eee056a8a489f9 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/tests/odml-to-stablehlo-smuggle-resize.mlir +++ b/tensorflow/compiler/mlir/lite/stablehlo/tests/odml-to-stablehlo-smuggle-resize.mlir @@ -5,8 +5,8 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, producer = 975 : i32}, tf_saved_model.semantics} { func.func @serving_default(%arg0: tensor<1x32x32x128xf32> {tf_saved_model.index_path = ["a"]}) -> (tensor<1x64x64x128xf32> {tf_saved_model.index_path = ["b"]}) attributes {tf.entry_function = {control_outputs = "", inputs = "c:0", outputs = "d:0"}, tf_saved_model.exported_names = ["serving_default"]} { %0 = "tf.Const"() {value = dense<[56, 904]> : tensor<2xi32>} : () -> tensor<2xi32> - // CHECK: %1 = stablehlo.custom_call @tf.ResizeBilinear(%arg0, %0) {align_corners = false, device = "", half_pixel_centers = true} : (tensor<1x32x32x128xf32>, tensor<2xi32>) -> tensor<1x64x64x128xf32> - // CHECK-OPT: %0 = stablehlo.custom_call @tf.ResizeBilinear(%arg0, %cst) {align_corners = false, device 
= "", half_pixel_centers = true} : (tensor<1x32x32x128xf32>, tensor<2xi32>) -> tensor<1x64x64x128xf32> + // CHECK: %{{.*}} = stablehlo.custom_call @tf.ResizeBilinear(%arg0, %{{.*}}) {align_corners = false, device = "", half_pixel_centers = true} : (tensor<1x32x32x128xf32>, tensor<2xi32>) -> tensor<1x64x64x128xf32> + // CHECK-OPT: %{{.*}} = stablehlo.custom_call @tf.ResizeBilinear(%arg0, %cst) {align_corners = false, device = "", half_pixel_centers = true} : (tensor<1x32x32x128xf32>, tensor<2xi32>) -> tensor<1x64x64x128xf32> %1 = "tf.ResizeBilinear"(%arg0, %0) { align_corners = false, device = "", half_pixel_centers = true } : (tensor<1x32x32x128xf32>, tensor<2xi32>) -> tensor<1x64x64x128xf32> diff --git a/tensorflow/compiler/mlir/lite/stablehlo/tests/optimize.mlir b/tensorflow/compiler/mlir/lite/stablehlo/tests/optimize.mlir index 722fc5b47459f8..7db47a1a3e7703 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/tests/optimize.mlir +++ b/tensorflow/compiler/mlir/lite/stablehlo/tests/optimize.mlir @@ -5,11 +5,11 @@ func.func @testDotToDotGeneralVectorVector(%arg0: tensor<3072xf32>, %arg1: tenso %0 = "mhlo.dot"(%arg0, %arg1) : (tensor<3072xf32>, tensor<3072xf32>) -> tensor func.return %0 : tensor -// CHECK: %[[RES:.*]] = "mhlo.dot_general"(%arg0, %arg1) { +// CHECK: %[[RES:.*]] = "mhlo.dot_general"(%arg0, %arg1) <{ // CHECK-SAME: dot_dimension_numbers = #mhlo.dot< // CHECK-SAME: lhs_contracting_dimensions = [0], // CHECK-SAME: rhs_contracting_dimensions = [0] -// CHECK-SAME: >} : (tensor<3072xf32>, tensor<3072xf32>) -> tensor +// CHECK-SAME: >}> : (tensor<3072xf32>, tensor<3072xf32>) -> tensor // CHECK: return %[[RES]] : tensor } @@ -20,11 +20,11 @@ func.func @testDotToDotGeneralVectorMatrix(%arg0: tensor<3072xf32>, %arg1: tenso %0 = "mhlo.dot"(%arg0, %arg1) : (tensor<3072xf32>, tensor<3072x512xf32>) -> tensor<512xf32> func.return %0 : tensor<512xf32> -// CHECK: %[[RES:.*]] = "mhlo.dot_general"(%arg0, %arg1) { +// CHECK: %[[RES:.*]] = "mhlo.dot_general"(%arg0, 
%arg1) <{ // CHECK-SAME: dot_dimension_numbers = #mhlo.dot< // CHECK-SAME: lhs_contracting_dimensions = [0], // CHECK-SAME: rhs_contracting_dimensions = [0] -// CHECK-SAME: >} : (tensor<3072xf32>, tensor<3072x512xf32>) -> tensor<512xf32> +// CHECK-SAME: >}> : (tensor<3072xf32>, tensor<3072x512xf32>) -> tensor<512xf32> // CHECK: return %[[RES]] : tensor<512xf32> } @@ -35,11 +35,11 @@ func.func @testDotToDotGeneralMatrixVector(%arg0: tensor<2x3072xf32>, %arg1: ten %0 = "mhlo.dot"(%arg0, %arg1) : (tensor<2x3072xf32>, tensor<3072xf32>) -> tensor<2xf32> func.return %0 : tensor<2xf32> -// CHECK: %[[RES:.*]] = "mhlo.dot_general"(%arg0, %arg1) { +// CHECK: %[[RES:.*]] = "mhlo.dot_general"(%arg0, %arg1) <{ // CHECK-SAME: dot_dimension_numbers = #mhlo.dot< // CHECK-SAME: lhs_contracting_dimensions = [1], // CHECK-SAME: rhs_contracting_dimensions = [0] -// CHECK-SAME: >} : (tensor<2x3072xf32>, tensor<3072xf32>) -> tensor<2xf32> +// CHECK-SAME: >}> : (tensor<2x3072xf32>, tensor<3072xf32>) -> tensor<2xf32> // CHECK: return %[[RES]] : tensor<2xf32> } @@ -50,11 +50,11 @@ func.func @testDotToDotGeneralMatrixMatrix(%arg0: tensor<2x3072xf32>, %arg1: ten %0 = "mhlo.dot"(%arg0, %arg1) : (tensor<2x3072xf32>, tensor<3072x512xf32>) -> tensor<2x512xf32> func.return %0 : tensor<2x512xf32> -// CHECK: %[[RES:.*]] = "mhlo.dot_general"(%arg0, %arg1) { +// CHECK: %[[RES:.*]] = "mhlo.dot_general"(%arg0, %arg1) <{ // CHECK-SAME: dot_dimension_numbers = #mhlo.dot< // CHECK-SAME: lhs_contracting_dimensions = [1], // CHECK-SAME: rhs_contracting_dimensions = [0] -// CHECK-SAME: >} : (tensor<2x3072xf32>, tensor<3072x512xf32>) -> tensor<2x512xf32> +// CHECK-SAME: >}> : (tensor<2x3072xf32>, tensor<3072x512xf32>) -> tensor<2x512xf32> // CHECK: return %[[RES]] : tensor<2x512xf32> } @@ -73,13 +73,13 @@ func.func @testRemoveReshapeAroundDotGeneral(%arg0: tensor<3x72x1x2048xf32>, %ar %2 = "mhlo.reshape"(%1) : (tensor<3x72x512xf32>) -> tensor<3x72x1x512xf32> func.return %2 : tensor<3x72x1x512xf32> -// CHECK: 
%[[RES:.*]] = "mhlo.dot_general"(%arg0, %arg1) { +// CHECK: %[[RES:.*]] = "mhlo.dot_general"(%arg0, %arg1) <{ // CHECK-SAME: dot_dimension_numbers = #mhlo.dot< // CHECK-SAME: lhs_batching_dimensions = [0], // CHECK-SAME: rhs_batching_dimensions = [0], // CHECK-SAME: lhs_contracting_dimensions = [3], // CHECK-SAME: rhs_contracting_dimensions = [1] -// CHECK-SAME: >} : (tensor<3x72x1x2048xf32>, tensor<3x2048x512xf32>) -> tensor<3x72x1x512xf32> +// CHECK-SAME: >}> : (tensor<3x72x1x2048xf32>, tensor<3x2048x512xf32>) -> tensor<3x72x1x512xf32> // CHECK: return %[[RES]] : tensor<3x72x1x512xf32> } @@ -92,11 +92,11 @@ func.func @testRemoveReshapeAroundDot(%arg0: tensor<1x1x512xf32>, %arg1: tensor< %2 = "mhlo.reshape"(%1) : (tensor<1x13xf32>) -> tensor<1x1x13xf32> func.return %2 : tensor<1x1x13xf32> -// CHECK: %[[RES:.*]] = "mhlo.dot_general"(%arg0, %arg1) { +// CHECK: %[[RES:.*]] = "mhlo.dot_general"(%arg0, %arg1) <{ // CHECK-SAME: dot_dimension_numbers = #mhlo.dot< // CHECK-SAME: lhs_contracting_dimensions = [2], // CHECK-SAME: rhs_contracting_dimensions = [0] -// CHECK-SAME: >} : (tensor<1x1x512xf32>, tensor<512x13x!quant.uniform>) -> tensor<1x1x13xf32> +// CHECK-SAME: >}> : (tensor<1x1x512xf32>, tensor<512x13x!quant.uniform>) -> tensor<1x1x13xf32> // CHECK: return %[[RES]] : tensor<1x1x13xf32> } @@ -105,15 +105,15 @@ func.func @testRemoveReshapeAroundDot(%arg0: tensor<1x1x512xf32>, %arg1: tensor< // CHECK-LABEL: testTwoConsecutivePads func.func @testTwoConsecutivePads(%arg0: tensor<10x10x10xf32>) -> (tensor<12x12x12xf32>) { %0 = mhlo.constant dense<0.000000e+00> : tensor - %1 = "mhlo.pad"(%arg0, %0) {edge_padding_high = dense<0> : tensor<3xi64>, edge_padding_low = dense<1> : tensor<3xi64>, interior_padding = dense<0> : tensor<3xi64>} : (tensor<10x10x10xf32>, tensor) -> tensor<11x11x11xf32> + %1 = "mhlo.pad"(%arg0, %0) <{edge_padding_high = dense<0> : tensor<3xi64>, edge_padding_low = dense<1> : tensor<3xi64>, interior_padding = dense<0> : tensor<3xi64>}> : 
(tensor<10x10x10xf32>, tensor) -> tensor<11x11x11xf32> %2 = mhlo.constant dense<0.000000e+00> : tensor - %3 = "mhlo.pad"(%1, %2) {edge_padding_high = dense<1> : tensor<3xi64>, edge_padding_low = dense<0> : tensor<3xi64>, interior_padding = dense<0> : tensor<3xi64>} : (tensor<11x11x11xf32>, tensor) -> tensor<12x12x12xf32> + %3 = "mhlo.pad"(%1, %2) <{edge_padding_high = dense<1> : tensor<3xi64>, edge_padding_low = dense<0> : tensor<3xi64>, interior_padding = dense<0> : tensor<3xi64>}> : (tensor<11x11x11xf32>, tensor) -> tensor<12x12x12xf32> return %3 : tensor<12x12x12xf32> -// CHECK: %[[RES:.*]] = "mhlo.pad"(%arg0, %0) { +// CHECK: %[[RES:.*]] = "mhlo.pad"(%arg0, %0) <{ // CHECK-SAME: edge_padding_high = dense<1> : tensor<3xi64>, // CHECK-SAME: edge_padding_low = dense<1> : tensor<3xi64>, // CHECK-SAME: interior_padding = dense<0> : tensor<3xi64> -// CHECK-SAME: } : (tensor<10x10x10xf32>, tensor) -> tensor<12x12x12xf32> +// CHECK-SAME: }> : (tensor<10x10x10xf32>, tensor) -> tensor<12x12x12xf32> // CHECK: return %[[RES]] : tensor<12x12x12xf32> } @@ -122,16 +122,16 @@ func.func @testTwoConsecutivePads(%arg0: tensor<10x10x10xf32>) -> (tensor<12x12x // CHECK-LABEL: testTwoConsecutivePadsNegativeLowPad func.func @testTwoConsecutivePadsNegativeLowPad(%arg0: tensor<10x10x10xf32>) -> (tensor<10x10x10xf32>) { %0 = mhlo.constant dense<0.000000e+00> : tensor - %1 = "mhlo.pad"(%arg0, %0) {edge_padding_high = dense<0> : tensor<3xi64>, edge_padding_low = dense<-1> : tensor<3xi64>, interior_padding = dense<0> : tensor<3xi64>} : (tensor<10x10x10xf32>, tensor) -> tensor<9x9x9xf32> + %1 = "mhlo.pad"(%arg0, %0) <{edge_padding_high = dense<0> : tensor<3xi64>, edge_padding_low = dense<-1> : tensor<3xi64>, interior_padding = dense<0> : tensor<3xi64>}> : (tensor<10x10x10xf32>, tensor) -> tensor<9x9x9xf32> %2 = mhlo.constant dense<0.000000e+00> : tensor - %3 = "mhlo.pad"(%1, %2) {edge_padding_high = dense<1> : tensor<3xi64>, edge_padding_low = dense<0> : tensor<3xi64>, interior_padding = 
dense<0> : tensor<3xi64>} : (tensor<9x9x9xf32>, tensor) -> tensor<10x10x10xf32> + %3 = "mhlo.pad"(%1, %2) <{edge_padding_high = dense<1> : tensor<3xi64>, edge_padding_low = dense<0> : tensor<3xi64>, interior_padding = dense<0> : tensor<3xi64>}> : (tensor<9x9x9xf32>, tensor) -> tensor<10x10x10xf32> return %3 : tensor<10x10x10xf32> -// CHECK: %[[RES:.*]] = "mhlo.pad"(%arg0, %0) { +// CHECK: %[[RES:.*]] = "mhlo.pad"(%arg0, %0) <{ // CHECK-SAME: edge_padding_high = dense<1> : tensor<3xi64>, // CHECK-SAME: edge_padding_low = dense<-1> : tensor<3xi64>, // CHECK-SAME: interior_padding = dense<0> : tensor<3xi64> -// CHECK-SAME: } : (tensor<10x10x10xf32>, tensor) -> tensor<10x10x10xf32> +// CHECK-SAME: }> : (tensor<10x10x10xf32>, tensor) -> tensor<10x10x10xf32> // CHECK: return %[[RES]] : tensor<10x10x10xf32> } @@ -140,16 +140,16 @@ func.func @testTwoConsecutivePadsNegativeLowPad(%arg0: tensor<10x10x10xf32>) -> // CHECK-LABEL: testTwoConsecutivePadsTwoNegativeHighPad func.func @testTwoConsecutivePadsTwoNegativeHighPad(%arg0: tensor<10x10x10xf32>) -> (tensor<9x9x9xf32>) { %0 = mhlo.constant dense<0.000000e+00> : tensor - %1 = "mhlo.pad"(%arg0, %0) {edge_padding_high = dense<-1> : tensor<3xi64>, edge_padding_low = dense<1> : tensor<3xi64>, interior_padding = dense<0> : tensor<3xi64>} : (tensor<10x10x10xf32>, tensor) -> tensor<10x10x10xf32> + %1 = "mhlo.pad"(%arg0, %0) <{edge_padding_high = dense<-1> : tensor<3xi64>, edge_padding_low = dense<1> : tensor<3xi64>, interior_padding = dense<0> : tensor<3xi64>}> : (tensor<10x10x10xf32>, tensor) -> tensor<10x10x10xf32> %2 = mhlo.constant dense<0.000000e+00> : tensor - %3 = "mhlo.pad"(%1, %2) {edge_padding_high = dense<-1> : tensor<3xi64>, edge_padding_low = dense<0> : tensor<3xi64>, interior_padding = dense<0> : tensor<3xi64>} : (tensor<10x10x10xf32>, tensor) -> tensor<9x9x9xf32> + %3 = "mhlo.pad"(%1, %2) <{edge_padding_high = dense<-1> : tensor<3xi64>, edge_padding_low = dense<0> : tensor<3xi64>, interior_padding = dense<0> : 
tensor<3xi64>}> : (tensor<10x10x10xf32>, tensor) -> tensor<9x9x9xf32> return %3 : tensor<9x9x9xf32> -// CHECK: %[[RES:.*]] = "mhlo.pad"(%arg0, %0) { +// CHECK: %[[RES:.*]] = "mhlo.pad"(%arg0, %0) <{ // CHECK-SAME: edge_padding_high = dense<-2> : tensor<3xi64>, // CHECK-SAME: edge_padding_low = dense<1> : tensor<3xi64>, // CHECK-SAME: interior_padding = dense<0> : tensor<3xi64> -// CHECK-SAME: } : (tensor<10x10x10xf32>, tensor) -> tensor<9x9x9xf32> +// CHECK-SAME: }> : (tensor<10x10x10xf32>, tensor) -> tensor<9x9x9xf32> // CHECK: return %[[RES]] : tensor<9x9x9xf32> } @@ -158,16 +158,16 @@ func.func @testTwoConsecutivePadsTwoNegativeHighPad(%arg0: tensor<10x10x10xf32>) // CHECK-LABEL: testTwoConsecutivePadsPositiveNegativeHighPad func.func @testTwoConsecutivePadsPositiveNegativeHighPad(%arg0: tensor<10x10x10xf32>) -> (tensor<11x11x11xf32>) { %0 = mhlo.constant dense<0.000000e+00> : tensor - %1 = "mhlo.pad"(%arg0, %0) {edge_padding_high = dense<1> : tensor<3xi64>, edge_padding_low = dense<1> : tensor<3xi64>, interior_padding = dense<0> : tensor<3xi64>} : (tensor<10x10x10xf32>, tensor) -> tensor<12x12x12xf32> + %1 = "mhlo.pad"(%arg0, %0) <{edge_padding_high = dense<1> : tensor<3xi64>, edge_padding_low = dense<1> : tensor<3xi64>, interior_padding = dense<0> : tensor<3xi64>}> : (tensor<10x10x10xf32>, tensor) -> tensor<12x12x12xf32> %2 = mhlo.constant dense<0.000000e+00> : tensor - %3 = "mhlo.pad"(%1, %2) {edge_padding_high = dense<-1> : tensor<3xi64>, edge_padding_low = dense<0> : tensor<3xi64>, interior_padding = dense<0> : tensor<3xi64>} : (tensor<12x12x12xf32>, tensor) -> tensor<11x11x11xf32> + %3 = "mhlo.pad"(%1, %2) <{edge_padding_high = dense<-1> : tensor<3xi64>, edge_padding_low = dense<0> : tensor<3xi64>, interior_padding = dense<0> : tensor<3xi64>}> : (tensor<12x12x12xf32>, tensor) -> tensor<11x11x11xf32> return %3 : tensor<11x11x11xf32> -// CHECK: %[[RES:.*]] = "mhlo.pad"(%arg0, %0) { +// CHECK: %[[RES:.*]] = "mhlo.pad"(%arg0, %0) <{ // CHECK-SAME: 
edge_padding_high = dense<0> : tensor<3xi64>, // CHECK-SAME: edge_padding_low = dense<1> : tensor<3xi64>, // CHECK-SAME: interior_padding = dense<0> : tensor<3xi64> -// CHECK-SAME: } : (tensor<10x10x10xf32>, tensor) -> tensor<11x11x11xf32> +// CHECK-SAME: }> : (tensor<10x10x10xf32>, tensor) -> tensor<11x11x11xf32> // CHECK: return %[[RES]] : tensor<11x11x11xf32> } @@ -176,22 +176,22 @@ func.func @testTwoConsecutivePadsPositiveNegativeHighPad(%arg0: tensor<10x10x10x // CHECK-LABEL: testTwoConsecutivePadsNegativePositiveHighPad func.func @testTwoConsecutivePadsNegativePositiveHighPad(%arg0: tensor<10x10x10xf32>) -> (tensor<11x11x11xf32>) { %0 = mhlo.constant dense<0.000000e+00> : tensor - %1 = "mhlo.pad"(%arg0, %0) {edge_padding_high = dense<-1> : tensor<3xi64>, edge_padding_low = dense<1> : tensor<3xi64>, interior_padding = dense<0> : tensor<3xi64>} : (tensor<10x10x10xf32>, tensor) -> tensor<10x10x10xf32> + %1 = "mhlo.pad"(%arg0, %0) <{edge_padding_high = dense<-1> : tensor<3xi64>, edge_padding_low = dense<1> : tensor<3xi64>, interior_padding = dense<0> : tensor<3xi64>}> : (tensor<10x10x10xf32>, tensor) -> tensor<10x10x10xf32> %2 = mhlo.constant dense<0.000000e+00> : tensor - %3 = "mhlo.pad"(%1, %2) {edge_padding_high = dense<1> : tensor<3xi64>, edge_padding_low = dense<0> : tensor<3xi64>, interior_padding = dense<0> : tensor<3xi64>} : (tensor<10x10x10xf32>, tensor) -> tensor<11x11x11xf32> + %3 = "mhlo.pad"(%1, %2) <{edge_padding_high = dense<1> : tensor<3xi64>, edge_padding_low = dense<0> : tensor<3xi64>, interior_padding = dense<0> : tensor<3xi64>}> : (tensor<10x10x10xf32>, tensor) -> tensor<11x11x11xf32> return %3 : tensor<11x11x11xf32> -// CHECK: "mhlo.pad"(%arg0, %0) { +// CHECK: "mhlo.pad"(%arg0, %0) <{ // CHECK-SAME: edge_padding_high = dense<-1> : tensor<3xi64>, // CHECK-SAME: edge_padding_low = dense<1> : tensor<3xi64>, // CHECK-SAME: interior_padding = dense<0> : tensor<3xi64> -// CHECK-SAME: } : (tensor<10x10x10xf32>, tensor) -> tensor<10x10x10xf32> +// 
CHECK-SAME: }> : (tensor<10x10x10xf32>, tensor) -> tensor<10x10x10xf32> -// CHECK: "mhlo.pad"(%1, %0) { +// CHECK: "mhlo.pad"(%1, %0) <{ // CHECK-SAME: edge_padding_high = dense<1> : tensor<3xi64>, // CHECK-SAME: edge_padding_low = dense<0> : tensor<3xi64>, // CHECK-SAME: interior_padding = dense<0> : tensor<3xi64> -// CHECK-SAME: } : (tensor<10x10x10xf32>, tensor) -> tensor<11x11x11xf32> +// CHECK-SAME: }> : (tensor<10x10x10xf32>, tensor) -> tensor<11x11x11xf32> } // ----- @@ -199,22 +199,22 @@ func.func @testTwoConsecutivePadsNegativePositiveHighPad(%arg0: tensor<10x10x10x // CHECK-LABEL: testTwoConsecutivePadsDifferentPadVal func.func @testTwoConsecutivePadsDifferentPadVal(%arg0: tensor<10x10x10xf32>) -> (tensor<14x14x14xf32>) { %0 = mhlo.constant dense<1.000000e+00> : tensor - %1 = "mhlo.pad"(%arg0, %0) {edge_padding_high = dense<1> : tensor<3xi64>, edge_padding_low = dense<1> : tensor<3xi64>, interior_padding = dense<0> : tensor<3xi64>} : (tensor<10x10x10xf32>, tensor) -> tensor<12x12x12xf32> + %1 = "mhlo.pad"(%arg0, %0) <{edge_padding_high = dense<1> : tensor<3xi64>, edge_padding_low = dense<1> : tensor<3xi64>, interior_padding = dense<0> : tensor<3xi64>}> : (tensor<10x10x10xf32>, tensor) -> tensor<12x12x12xf32> %2 = mhlo.constant dense<0.000000e+00> : tensor - %3 = "mhlo.pad"(%1, %2) {edge_padding_high = dense<1> : tensor<3xi64>, edge_padding_low = dense<1> : tensor<3xi64>, interior_padding = dense<0> : tensor<3xi64>} : (tensor<12x12x12xf32>, tensor) -> tensor<14x14x14xf32> + %3 = "mhlo.pad"(%1, %2) <{edge_padding_high = dense<1> : tensor<3xi64>, edge_padding_low = dense<1> : tensor<3xi64>, interior_padding = dense<0> : tensor<3xi64>}> : (tensor<12x12x12xf32>, tensor) -> tensor<14x14x14xf32> return %3 : tensor<14x14x14xf32> -// CHECK: "mhlo.pad"(%arg0, %1) { +// CHECK: "mhlo.pad"(%arg0, %1) <{ // CHECK-SAME: edge_padding_high = dense<1> : tensor<3xi64>, // CHECK-SAME: edge_padding_low = dense<1> : tensor<3xi64>, // CHECK-SAME: interior_padding = dense<0> : 
tensor<3xi64> -// CHECK-SAME: } : (tensor<10x10x10xf32>, tensor) -> tensor<12x12x12xf32> +// CHECK-SAME: }> : (tensor<10x10x10xf32>, tensor) -> tensor<12x12x12xf32> -// CHECK: "mhlo.pad"(%2, %0) { +// CHECK: "mhlo.pad"(%2, %0) <{ // CHECK-SAME: edge_padding_high = dense<1> : tensor<3xi64>, // CHECK-SAME: edge_padding_low = dense<1> : tensor<3xi64>, // CHECK-SAME: interior_padding = dense<0> : tensor<3xi64> -// CHECK-SAME: } : (tensor<12x12x12xf32>, tensor) -> tensor<14x14x14xf32> +// CHECK-SAME: }> : (tensor<12x12x12xf32>, tensor) -> tensor<14x14x14xf32> } // ----- @@ -222,23 +222,23 @@ func.func @testTwoConsecutivePadsDifferentPadVal(%arg0: tensor<10x10x10xf32>) -> // CHECK-LABEL: testTwoConsecutivePadsDifferentUsers func.func @testTwoConsecutivePadsDifferentUsers(%arg0: tensor<10x10x10xf32>) -> (tensor<13x13x13xf32>, tensor<12x12x12xf32>) { %0 = mhlo.constant dense<0.000000e+00> : tensor - %1 = "mhlo.pad"(%arg0, %0) {edge_padding_high = dense<1> : tensor<3xi64>, edge_padding_low = dense<1> : tensor<3xi64>, interior_padding = dense<0> : tensor<3xi64>} : (tensor<10x10x10xf32>, tensor) -> tensor<12x12x12xf32> + %1 = "mhlo.pad"(%arg0, %0) <{edge_padding_high = dense<1> : tensor<3xi64>, edge_padding_low = dense<1> : tensor<3xi64>, interior_padding = dense<0> : tensor<3xi64>}> : (tensor<10x10x10xf32>, tensor) -> tensor<12x12x12xf32> %2 = mhlo.exponential %1 : tensor<12x12x12xf32> %3 = mhlo.constant dense<0.000000e+00> : tensor - %4 = "mhlo.pad"(%1, %3) {edge_padding_high = dense<1> : tensor<3xi64>, edge_padding_low = dense<0> : tensor<3xi64>, interior_padding = dense<0> : tensor<3xi64>} : (tensor<12x12x12xf32>, tensor) -> tensor<13x13x13xf32> + %4 = "mhlo.pad"(%1, %3) <{edge_padding_high = dense<1> : tensor<3xi64>, edge_padding_low = dense<0> : tensor<3xi64>, interior_padding = dense<0> : tensor<3xi64>}> : (tensor<12x12x12xf32>, tensor) -> tensor<13x13x13xf32> return %4, %2 : tensor<13x13x13xf32>, tensor<12x12x12xf32> -// CHECK: "mhlo.pad"(%arg0, %0) { +// CHECK: 
"mhlo.pad"(%arg0, %0) <{ // CHECK-SAME: edge_padding_high = dense<1> : tensor<3xi64>, // CHECK-SAME: edge_padding_low = dense<1> : tensor<3xi64>, // CHECK-SAME: interior_padding = dense<0> : tensor<3xi64> -// CHECK-SAME: } : (tensor<10x10x10xf32>, tensor) -> tensor<12x12x12xf32> +// CHECK-SAME: }> : (tensor<10x10x10xf32>, tensor) -> tensor<12x12x12xf32> -// CHECK: "mhlo.pad"(%1, %0) { +// CHECK: "mhlo.pad"(%1, %0) <{ // CHECK-SAME: edge_padding_high = dense<1> : tensor<3xi64>, // CHECK-SAME: edge_padding_low = dense<0> : tensor<3xi64>, // CHECK-SAME: interior_padding = dense<0> : tensor<3xi64> -// CHECK-SAME: } : (tensor<12x12x12xf32>, tensor) -> tensor<13x13x13xf32> +// CHECK-SAME: }> : (tensor<12x12x12xf32>, tensor) -> tensor<13x13x13xf32> } // ----- @@ -246,18 +246,18 @@ func.func @testTwoConsecutivePadsDifferentUsers(%arg0: tensor<10x10x10xf32>) -> // CHECK-LABEL: testTwoConsecutivePadsMultipleDownstreamUsers func.func @testTwoConsecutivePadsMultipleDownstreamUsers(%arg0: tensor<10x10x10xf32>) -> (tensor<13x13x13xf32>, tensor<13x13x13xf32>) { %0 = mhlo.constant dense<0.000000e+00> : tensor - %1 = "mhlo.pad"(%arg0, %0) {edge_padding_high = dense<1> : tensor<3xi64>, edge_padding_low = dense<1> : tensor<3xi64>, interior_padding = dense<0> : tensor<3xi64>} : (tensor<10x10x10xf32>, tensor) -> tensor<12x12x12xf32> + %1 = "mhlo.pad"(%arg0, %0) <{edge_padding_high = dense<1> : tensor<3xi64>, edge_padding_low = dense<1> : tensor<3xi64>, interior_padding = dense<0> : tensor<3xi64>}> : (tensor<10x10x10xf32>, tensor) -> tensor<12x12x12xf32> %2 = mhlo.constant dense<0.000000e+00> : tensor - %3 = "mhlo.pad"(%1, %2) {edge_padding_high = dense<1> : tensor<3xi64>, edge_padding_low = dense<0> : tensor<3xi64>, interior_padding = dense<0> : tensor<3xi64>} : (tensor<12x12x12xf32>, tensor) -> tensor<13x13x13xf32> + %3 = "mhlo.pad"(%1, %2) <{edge_padding_high = dense<1> : tensor<3xi64>, edge_padding_low = dense<0> : tensor<3xi64>, interior_padding = dense<0> : tensor<3xi64>}> : 
(tensor<12x12x12xf32>, tensor) -> tensor<13x13x13xf32> %4 = mhlo.exponential %3 : tensor<13x13x13xf32> %5 = mhlo.tanh %3 : tensor<13x13x13xf32> return %4, %5 : tensor<13x13x13xf32>, tensor<13x13x13xf32> -// CHECK: "mhlo.pad"(%arg0, %0) { +// CHECK: "mhlo.pad"(%arg0, %0) <{ // CHECK-SAME: edge_padding_high = dense<2> : tensor<3xi64>, // CHECK-SAME: edge_padding_low = dense<1> : tensor<3xi64>, // CHECK-SAME: interior_padding = dense<0> : tensor<3xi64> -// CHECK-SAME: } : (tensor<10x10x10xf32>, tensor) -> tensor<13x13x13xf32> +// CHECK-SAME: }> : (tensor<10x10x10xf32>, tensor) -> tensor<13x13x13xf32> // CHECK: mhlo.exponential %1 : tensor<13x13x13xf32> // CHECK: mhlo.tanh %1 : tensor<13x13x13xf32> @@ -283,15 +283,15 @@ func.func @testLiftDotConcatLHSSimple(%arg0: tensor<1x1x512xf32>, %arg1: tensor< lhs_contracting_dimensions = [2], rhs_contracting_dimensions = [0] >} : (tensor<3x1x512xf32>, tensor<512x13xf32>) -> tensor<3x1x13xf32> - %r = "mhlo.concatenate"(%0, %1, %2) {dimension = 0 : i64} : (tensor<1x1x13xf32>, tensor<2x1x13xf32>, tensor<3x1x13xf32>) -> tensor<6x1x13xf32> + %r = "mhlo.concatenate"(%0, %1, %2) <{dimension = 0 : i64}> : (tensor<1x1x13xf32>, tensor<2x1x13xf32>, tensor<3x1x13xf32>) -> tensor<6x1x13xf32> func.return %r : tensor<6x1x13xf32> -// CHECK: %[[R0:.*]] = "mhlo.concatenate"(%arg0, %arg1, %arg2) {dimension = 0 : i64} : (tensor<1x1x512xf32>, tensor<2x1x512xf32>, tensor<3x1x512xf32>) -> tensor<6x1x512xf32> -// CHECK: %[[R1:.*]] = "mhlo.dot_general"(%[[R0]], %arg3) { +// CHECK: %[[R0:.*]] = "mhlo.concatenate"(%arg0, %arg1, %arg2) <{dimension = 0 : i64}> : (tensor<1x1x512xf32>, tensor<2x1x512xf32>, tensor<3x1x512xf32>) -> tensor<6x1x512xf32> +// CHECK: %[[R1:.*]] = "mhlo.dot_general"(%[[R0]], %arg3) <{ // CHECK-SAME: dot_dimension_numbers = #mhlo.dot< // CHECK-SAME: lhs_contracting_dimensions = [2], // CHECK-SAME: rhs_contracting_dimensions = [0] -// CHECK-SAME: >} : (tensor<6x1x512xf32>, tensor<512x13xf32>) -> tensor<6x1x13xf32> +// CHECK-SAME: >}> : 
(tensor<6x1x512xf32>, tensor<512x13xf32>) -> tensor<6x1x13xf32> // CHECK: return %[[R1]] : tensor<6x1x13xf32> } @@ -313,17 +313,17 @@ func.func @testLiftDotConcatLHSComplex(%arg0: tensor<1x9x2x3x8x4x10xf32>, %arg1: lhs_contracting_dimensions = [4, 1, 6], rhs_contracting_dimensions = [6, 0, 4] >} : (tensor<1x9x2x3x8x100x10xf32>, tensor<9x2x1x5x10x5x8x7xf32>) -> tensor<1x2x3x100x5x5x7xf32> - %r = "mhlo.concatenate"(%0, %1) {dimension = 3 : i64} : (tensor<1x2x3x4x5x5x7xf32>, tensor<1x2x3x100x5x5x7xf32>) -> tensor<1x2x3x104x5x5x7xf32> + %r = "mhlo.concatenate"(%0, %1) <{dimension = 3 : i64}> : (tensor<1x2x3x4x5x5x7xf32>, tensor<1x2x3x100x5x5x7xf32>) -> tensor<1x2x3x104x5x5x7xf32> func.return %r : tensor<1x2x3x104x5x5x7xf32> -// CHECK: %[[R0:.*]] = "mhlo.concatenate"(%arg0, %arg1) {dimension = 5 : i64} : (tensor<1x9x2x3x8x4x10xf32>, tensor<1x9x2x3x8x100x10xf32>) -> tensor<1x9x2x3x8x104x10xf32> -// CHECK: %[[R1:.*]] = "mhlo.dot_general"(%[[R0]], %arg2) { +// CHECK: %[[R0:.*]] = "mhlo.concatenate"(%arg0, %arg1) <{dimension = 5 : i64}> : (tensor<1x9x2x3x8x4x10xf32>, tensor<1x9x2x3x8x100x10xf32>) -> tensor<1x9x2x3x8x104x10xf32> +// CHECK: %[[R1:.*]] = "mhlo.dot_general"(%[[R0]], %arg2) <{ // CHECK-SAME: dot_dimension_numbers = #mhlo.dot< // CHECK-SAME: lhs_batching_dimensions = [0, 2], // CHECK-SAME: rhs_batching_dimensions = [2, 1], // CHECK-SAME: lhs_contracting_dimensions = [4, 1, 6], // CHECK-SAME: rhs_contracting_dimensions = [6, 0, 4] -// CHECK-SAME: >} : (tensor<1x9x2x3x8x104x10xf32>, tensor<9x2x1x5x10x5x8x7xf32>) -> tensor<1x2x3x104x5x5x7xf32> +// CHECK-SAME: >}> : (tensor<1x9x2x3x8x104x10xf32>, tensor<9x2x1x5x10x5x8x7xf32>) -> tensor<1x2x3x104x5x5x7xf32> // CHECK: return %[[R1]] : tensor<1x2x3x104x5x5x7xf32> } @@ -359,18 +359,18 @@ func.func @testLiftDotConcatLHSAndRHS(%arg0: tensor<1x72x128xf32>, %arg1: tensor lhs_contracting_dimensions = [2], rhs_contracting_dimensions = [1] >} : (tensor<1x72x128xf32>, tensor<1x128x72xf32>) -> tensor<1x72x72xf32> - %4 = 
"mhlo.concatenate"(%0, %1, %2, %3) {dimension = 0 : i64} : (tensor<1x72x72xf32>, tensor<1x72x72xf32>, tensor<1x72x72xf32>, tensor<1x72x72xf32>) -> tensor<4x72x72xf32> + %4 = "mhlo.concatenate"(%0, %1, %2, %3) <{dimension = 0 : i64}> : (tensor<1x72x72xf32>, tensor<1x72x72xf32>, tensor<1x72x72xf32>, tensor<1x72x72xf32>) -> tensor<4x72x72xf32> func.return %4 : tensor<4x72x72xf32> -// CHECK: %[[R0:.*]] = "mhlo.concatenate"(%arg0, %arg2, %arg4, %arg6) {dimension = 0 : i64} : (tensor<1x72x128xf32>, tensor<1x72x128xf32>, tensor<1x72x128xf32>, tensor<1x72x128xf32>) -> tensor<4x72x128xf32> -// CHECK: %[[R1:.*]] = "mhlo.concatenate"(%arg1, %arg3, %arg5, %arg7) {dimension = 0 : i64} : (tensor<1x128x72xf32>, tensor<1x128x72xf32>, tensor<1x128x72xf32>, tensor<1x128x72xf32>) -> tensor<4x128x72xf32> -// CHECK: %[[R2:.*]] = "mhlo.dot_general"(%[[R0]], %[[R1]]) { +// CHECK: %[[R0:.*]] = "mhlo.concatenate"(%arg0, %arg2, %arg4, %arg6) <{dimension = 0 : i64}> : (tensor<1x72x128xf32>, tensor<1x72x128xf32>, tensor<1x72x128xf32>, tensor<1x72x128xf32>) -> tensor<4x72x128xf32> +// CHECK: %[[R1:.*]] = "mhlo.concatenate"(%arg1, %arg3, %arg5, %arg7) <{dimension = 0 : i64}> : (tensor<1x128x72xf32>, tensor<1x128x72xf32>, tensor<1x128x72xf32>, tensor<1x128x72xf32>) -> tensor<4x128x72xf32> +// CHECK: %[[R2:.*]] = "mhlo.dot_general"(%[[R0]], %[[R1]]) <{ // CHECK-SAME: dot_dimension_numbers = #mhlo.dot< // CHECK-SAME: lhs_batching_dimensions = [0], // CHECK-SAME: rhs_batching_dimensions = [0], // CHECK-SAME: lhs_contracting_dimensions = [2], // CHECK-SAME: rhs_contracting_dimensions = [1] -// CHECK-SAME: >} : (tensor<4x72x128xf32>, tensor<4x128x72xf32>) -> tensor<4x72x72xf32> +// CHECK-SAME: >}> : (tensor<4x72x128xf32>, tensor<4x128x72xf32>) -> tensor<4x72x72xf32> // CHECK: return %[[R2]] : tensor<4x72x72xf32> } @@ -378,10 +378,10 @@ func.func @testLiftDotConcatLHSAndRHS(%arg0: tensor<1x72x128xf32>, %arg1: tensor // CHECK-LABEL: testSliceConcat func.func @testSliceConcat(%arg0: tensor<3x1x512xf32>) 
-> tensor<3x1x512xf32> { - %0 = "mhlo.slice"(%arg0) {limit_indices = dense<[1, 1, 512]> : tensor<3xi64>, start_indices = dense<[0, 0, 0]> : tensor<3xi64>, strides = dense<1> : tensor<3xi64>} : (tensor<3x1x512xf32>) -> tensor<1x1x512xf32> - %1 = "mhlo.slice"(%arg0) {limit_indices = dense<[2, 1, 512]> : tensor<3xi64>, start_indices = dense<[1, 0, 0]> : tensor<3xi64>, strides = dense<1> : tensor<3xi64>} : (tensor<3x1x512xf32>) -> tensor<1x1x512xf32> - %2 = "mhlo.slice"(%arg0) {limit_indices = dense<[3, 1, 512]> : tensor<3xi64>, start_indices = dense<[2, 0, 0]> : tensor<3xi64>, strides = dense<1> : tensor<3xi64>} : (tensor<3x1x512xf32>) -> tensor<1x1x512xf32> - %r = "mhlo.concatenate"(%0, %1, %2) {dimension = 0 : i64} : (tensor<1x1x512xf32>, tensor<1x1x512xf32>, tensor<1x1x512xf32>) -> tensor<3x1x512xf32> + %0 = "mhlo.slice"(%arg0) <{limit_indices = dense<[1, 1, 512]> : tensor<3xi64>, start_indices = dense<[0, 0, 0]> : tensor<3xi64>, strides = dense<1> : tensor<3xi64>}> : (tensor<3x1x512xf32>) -> tensor<1x1x512xf32> + %1 = "mhlo.slice"(%arg0) <{limit_indices = dense<[2, 1, 512]> : tensor<3xi64>, start_indices = dense<[1, 0, 0]> : tensor<3xi64>, strides = dense<1> : tensor<3xi64>}> : (tensor<3x1x512xf32>) -> tensor<1x1x512xf32> + %2 = "mhlo.slice"(%arg0) <{limit_indices = dense<[3, 1, 512]> : tensor<3xi64>, start_indices = dense<[2, 0, 0]> : tensor<3xi64>, strides = dense<1> : tensor<3xi64>}> : (tensor<3x1x512xf32>) -> tensor<1x1x512xf32> + %r = "mhlo.concatenate"(%0, %1, %2) <{dimension = 0 : i64}> : (tensor<1x1x512xf32>, tensor<1x1x512xf32>, tensor<1x1x512xf32>) -> tensor<3x1x512xf32> func.return %r : tensor<3x1x512xf32> // CHECK: return %arg0 : tensor<3x1x512xf32> @@ -399,12 +399,12 @@ func.func @testConvertReshapeDotRhsToBatchedDot(%arg0: tensor<1x72x72xf32>, %arg >} : (tensor<1x72x72xf32>, tensor<72x128xf32>) -> tensor<1x72x128xf32> func.return %1 : tensor<1x72x128xf32> -// CHECK: %[[R:.*]] = "mhlo.dot_general"(%arg0, %arg1) { +// CHECK: %[[R:.*]] = 
"mhlo.dot_general"(%arg0, %arg1) <{ // CHECK-SAME: dot_dimension_numbers = #mhlo.dot< // CHECK-SAME: lhs_batching_dimensions = [0], // CHECK-SAME: rhs_batching_dimensions = [0], // CHECK-SAME: lhs_contracting_dimensions = [2], // CHECK-SAME: rhs_contracting_dimensions = [1] -// CHECK-SAME: >} : (tensor<1x72x72xf32>, tensor<1x72x128xf32>) -> tensor<1x72x128xf32> +// CHECK-SAME: >}> : (tensor<1x72x72xf32>, tensor<1x72x128xf32>) -> tensor<1x72x128xf32> // CHECK: return %[[R]] : tensor<1x72x128xf32> } diff --git a/tensorflow/compiler/mlir/lite/stablehlo/tests/tf-tfl-translate-serialize-stablehlo.mlir b/tensorflow/compiler/mlir/lite/stablehlo/tests/tf-tfl-translate-serialize-stablehlo.mlir index d5384b4c96a1f3..479147ded9bb5d 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/tests/tf-tfl-translate-serialize-stablehlo.mlir +++ b/tensorflow/compiler/mlir/lite/stablehlo/tests/tf-tfl-translate-serialize-stablehlo.mlir @@ -14,9 +14,9 @@ func.func @tfInplaceUpdate(%arg0: tensor<2x1x2xf32>) -> tensor<2x1x2xf32> { //CHECK: module attributes //CHECK-SAME: keep_stablehlo_constant = "true" //CHECK-NEXT: func.func @main(%arg0: tensor<2x1x2xf32>) -> tensor<2x1x2xf32> attributes {tf.entry_function = {inputs = "arg0", outputs = "vhlo.dynamic_update_slice_v1"}} { -//CHECK-DAG: %0 = stablehlo.constant dense<2.000000e+00> : tensor<1x1x2xf32> -//CHECK-DAG: %1 = stablehlo.constant dense<1> : tensor -//CHECK-DAG: %2 = stablehlo.constant dense<0> : tensor -//CHECK-NEXT: %3 = stablehlo.dynamic_update_slice %arg0, %0, %1, %2, %2 : (tensor<2x1x2xf32>, tensor<1x1x2xf32>, tensor, tensor, tensor) -> tensor<2x1x2xf32> -//CHECK-NEXT: return %3 : tensor<2x1x2xf32> +//CHECK-DAG: %[[c0:.+]] = stablehlo.constant dense<2.000000e+00> : tensor<1x1x2xf32> +//CHECK-DAG: %[[c1:.+]] = stablehlo.constant dense<1> : tensor +//CHECK-DAG: %[[c2:.+]] = stablehlo.constant dense<0> : tensor +//CHECK-NEXT: %[[c3:.+]] = stablehlo.dynamic_update_slice %arg0, %[[c0]], %[[c1]], %[[c2]], %[[c2]] : (tensor<2x1x2xf32>, 
tensor<1x1x2xf32>, tensor, tensor, tensor) -> tensor<2x1x2xf32> +//CHECK-NEXT: return %[[c3]] : tensor<2x1x2xf32> //CHECK-NEXT: } \ No newline at end of file diff --git a/tensorflow/compiler/mlir/lite/stablehlo/tests/tf-tfl-translate-tf-quantize.mlir b/tensorflow/compiler/mlir/lite/stablehlo/tests/tf-tfl-translate-tf-quantize.mlir index d23de7ce50cef9..0b0988363a760a 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/tests/tf-tfl-translate-tf-quantize.mlir +++ b/tensorflow/compiler/mlir/lite/stablehlo/tests/tf-tfl-translate-tf-quantize.mlir @@ -13,10 +13,10 @@ func.func @tfInplaceUpdate(%arg0: tensor<2x1x2xf32>) -> tensor<2x1x2xf32> { //CHECK: module { //CHECK-NEXT: func.func @main(%arg0: tensor<2x1x2xf32>) -> tensor<2x1x2xf32> { -//CHECK-DAG: %0 = stablehlo.constant dense<2.000000e+00> : tensor<1x1x2xf32> -//CHECK-DAG: %1 = stablehlo.constant dense<1> : tensor -//CHECK-DAG: %2 = stablehlo.constant dense<0> : tensor -//CHECK-NEXT: %3 = stablehlo.dynamic_update_slice %arg0, %0, %1, %2, %2 : (tensor<2x1x2xf32>, tensor<1x1x2xf32>, tensor, tensor, tensor) -> tensor<2x1x2xf32> -//CHECK-NEXT: return %3 : tensor<2x1x2xf32> +//CHECK-DAG: %[[c0:.+]] = stablehlo.constant dense<2.000000e+00> : tensor<1x1x2xf32> +//CHECK-DAG: %[[c1:.+]] = stablehlo.constant dense<1> : tensor +//CHECK-DAG: %[[c2:.+]] = stablehlo.constant dense<0> : tensor +//CHECK-NEXT: %[[c3:.+]] = stablehlo.dynamic_update_slice %arg0, %[[c0]], %[[c1]], %[[c2]], %[[c2]] : (tensor<2x1x2xf32>, tensor<1x1x2xf32>, tensor, tensor, tensor) -> tensor<2x1x2xf32> +//CHECK-NEXT: return %[[c3:.+]] : tensor<2x1x2xf32> //CHECK-NEXT: } //CHECK-NEXT:} diff --git a/tensorflow/compiler/mlir/lite/stablehlo/tests/tfl_legalize_hlo.mlir b/tensorflow/compiler/mlir/lite/stablehlo/tests/tfl_legalize_hlo.mlir index 4dcfa9c3cbacd7..9be635a44268f6 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/tests/tfl_legalize_hlo.mlir +++ b/tensorflow/compiler/mlir/lite/stablehlo/tests/tfl_legalize_hlo.mlir @@ -11,7 +11,7 @@ func.func 
@main(%arg0: tensor<5x7xf32>) -> tensor<5x7xf32> { // - transpose // func.func @transpose_2d(%arg0: tensor<2x3xf32>) -> tensor<3x2xf32> { - %0 = "mhlo.transpose"(%arg0) {permutation = dense<[1, 0]> : tensor<2xi64>} : (tensor<2x3xf32>) -> tensor<3x2xf32> + %0 = "mhlo.transpose"(%arg0) <{permutation = dense<[1, 0]> : tensor<2xi64>}> : (tensor<2x3xf32>) -> tensor<3x2xf32> func.return %0 : tensor<3x2xf32> // CHECK-LABEL: transpose_2d @@ -22,7 +22,7 @@ func.func @transpose_2d(%arg0: tensor<2x3xf32>) -> tensor<3x2xf32> { } func.func @transpose_3d(%arg0: tensor<1x2x3xf32>) -> tensor<3x2x1xf32> { - %0 = "mhlo.transpose"(%arg0) {permutation = dense<[2, 1, 0]> : tensor<3xi64>} : (tensor<1x2x3xf32>) -> tensor<3x2x1xf32> + %0 = "mhlo.transpose"(%arg0) <{permutation = dense<[2, 1, 0]> : tensor<3xi64>}> : (tensor<1x2x3xf32>) -> tensor<3x2x1xf32> func.return %0 : tensor<3x2x1xf32> // CHECK-LABEL: transpose_3d @@ -33,7 +33,7 @@ func.func @transpose_3d(%arg0: tensor<1x2x3xf32>) -> tensor<3x2x1xf32> { } func.func @transpose_dynamic_2d(%arg0: tensor) -> tensor<4x?xf32> { - %0 = "mhlo.transpose"(%arg0) {permutation = dense<[1, 0]> : tensor<2xi64>} : (tensor) -> tensor<4x?xf32> + %0 = "mhlo.transpose"(%arg0) <{permutation = dense<[1, 0]> : tensor<2xi64>}> : (tensor) -> tensor<4x?xf32> func.return %0 : tensor<4x?xf32> // CHECK-LABEL: transpose_dynamic_2d @@ -272,8 +272,8 @@ func.return %0 : tensor<4x4x256xf32> func.func @convert_argmax(%arg0: tensor<4x32x256xf32>) -> (tensor<4x32xf32>, tensor<4x32xi32>) { %0 = mhlo.constant dense<0xFF800000> : tensor %1 = mhlo.constant dense<0> : tensor - %2 = "mhlo.iota"() {iota_dimension = 0 : i64} : () -> tensor<256xi32> - %3 = "mhlo.broadcast_in_dim"(%2) {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<256xi32>) -> tensor<4x32x256xi32> + %2 = "mhlo.iota"() <{iota_dimension = 0 : i64}> : () -> tensor<256xi32> + %3 = "mhlo.broadcast_in_dim"(%2) <{broadcast_dimensions = dense<2> : tensor<1xi64>}> : (tensor<256xi32>) -> tensor<4x32x256xi32> 
%4:2 = "mhlo.reduce"(%arg0, %3, %0, %1) ({ ^bb0(%arg1: tensor, %arg2: tensor, %arg3: tensor, %arg4: tensor): %7 = "mhlo.compare"(%arg1, %arg3) {comparison_direction = #mhlo} : (tensor, tensor) -> tensor @@ -291,8 +291,8 @@ func.func @convert_argmax(%arg0: tensor<4x32x256xf32>) -> (tensor<4x32xf32>, ten // CHECK: %0 = mhlo.constant dense<0xFF800000> : tensor // CHECK-DAG: %1 = mhlo.constant dense<0> : tensor - // CHECK: %2 = "mhlo.iota"() {iota_dimension = 0 : i64} : () -> tensor<256xi32> - // CHECK: %3 = "mhlo.broadcast_in_dim"(%2) {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<256xi32>) -> tensor<4x32x256xi32> + // CHECK: %2 = "mhlo.iota"() <{iota_dimension = 0 : i64}> : () -> tensor<256xi32> + // CHECK: %3 = "mhlo.broadcast_in_dim"(%2) <{broadcast_dimensions = dense<2> : tensor<1xi64>}> : (tensor<256xi32>) -> tensor<4x32x256xi32> // CHECK: %cst = arith.constant dense<2> : tensor<1xi32> // CHECK: %4 = "tfl.reduce_max"(%arg0, %cst) {keep_dims = false} : (tensor<4x32x256xf32>, tensor<1xi32>) -> tensor<4x32xf32> // CHECK: %5 = "tfl.arg_max"(%arg0, %cst) : (tensor<4x32x256xf32>, tensor<1xi32>) -> tensor<4x32xi32> @@ -359,7 +359,7 @@ func.func @convert_argmax_constant_non_z_axis(%arg0: tensor<4x4xf32>) -> (tensor // CHECK-LABEL: func.func @convert_argmax_bool func.func @convert_argmax_bool(%arg0: tensor<2xi1>) -> tensor { - %0 = "mhlo.iota"() {iota_dimension = 0 : i64} : () -> tensor<2xi32> + %0 = "mhlo.iota"() <{iota_dimension = 0 : i64}> : () -> tensor<2xi32> %1 = mhlo.constant dense : tensor %2 = mhlo.constant dense<0> : tensor %3:2 = mhlo.reduce(%arg0 init: %1), (%0 init: %2) across dimensions = [0] : (tensor<2xi1>, tensor<2xi32>, tensor, tensor) -> (tensor, tensor) @@ -375,7 +375,7 @@ func.func @convert_argmax_bool(%arg0: tensor<2xi1>) -> tensor { } return %3#1 : tensor - // CHECK: %0 = "mhlo.iota"() {iota_dimension = 0 : i64} : () -> tensor<2xi32> + // CHECK: %0 = "mhlo.iota"() <{iota_dimension = 0 : i64}> : () -> tensor<2xi32> // CHECK-DAG: %1 = 
mhlo.constant dense : tensor // CHECK: %2 = mhlo.constant dense<0> : tensor // CHECK: %cst = arith.constant dense<0> : tensor<1xi32> @@ -388,8 +388,8 @@ func.func @convert_argmax_bool(%arg0: tensor<2xi1>) -> tensor { func.func @convert_argmin(%arg0: tensor<4x32x256xf32>) -> (tensor<4x32xf32>, tensor<4x32xi32>) { %0 = mhlo.constant dense<0x7F800000> : tensor %1 = mhlo.constant dense<0> : tensor - %2 = "mhlo.iota"() {iota_dimension = 0 : i64} : () -> tensor<256xi32> - %3 = "mhlo.broadcast_in_dim"(%2) {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<256xi32>) -> tensor<4x32x256xi32> + %2 = "mhlo.iota"() <{iota_dimension = 0 : i64}> : () -> tensor<256xi32> + %3 = "mhlo.broadcast_in_dim"(%2) <{broadcast_dimensions = dense<2> : tensor<1xi64>}> : (tensor<256xi32>) -> tensor<4x32x256xi32> %4:2 = "mhlo.reduce"(%arg0, %3, %0, %1) ({ ^bb0(%arg1: tensor, %arg2: tensor, %arg3: tensor, %arg4: tensor): %7 = "mhlo.compare"(%arg1, %arg3) {comparison_direction = #mhlo} : (tensor, tensor) -> tensor @@ -407,8 +407,8 @@ func.func @convert_argmin(%arg0: tensor<4x32x256xf32>) -> (tensor<4x32xf32>, ten // CHECK-DAG: %0 = mhlo.constant dense<0x7F800000> : tensor // CHECK: %1 = mhlo.constant dense<0> : tensor - // CHECK: %2 = "mhlo.iota"() {iota_dimension = 0 : i64} : () -> tensor<256xi32> - // CHECK: %3 = "mhlo.broadcast_in_dim"(%2) {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<256xi32>) -> tensor<4x32x256xi32> + // CHECK: %2 = "mhlo.iota"() <{iota_dimension = 0 : i64}> : () -> tensor<256xi32> + // CHECK: %3 = "mhlo.broadcast_in_dim"(%2) <{broadcast_dimensions = dense<2> : tensor<1xi64>}> : (tensor<256xi32>) -> tensor<4x32x256xi32> // CHECK: %cst = arith.constant dense<2> : tensor<1xi32> // CHECK: %4 = "tfl.reduce_min"(%arg0, %cst) {keep_dims = false} : (tensor<4x32x256xf32>, tensor<1xi32>) -> tensor<4x32xf32> // CHECK: %5 = "tfl.arg_min"(%arg0, %cst) : (tensor<4x32x256xf32>, tensor<1xi32>) -> tensor<4x32xi32> @@ -418,7 +418,7 @@ func.func @convert_argmin(%arg0: 
tensor<4x32x256xf32>) -> (tensor<4x32xf32>, ten // CHECK-LABEL: func @convert_argmin_i16 func.func @convert_argmin_i16(%arg0: tensor<2xi16>) -> (tensor, tensor) { %0 = mhlo.constant dense : tensor - %1 = "mhlo.iota"() {iota_dimension = 0 : i64} : () -> tensor<2xi32> + %1 = "mhlo.iota"() <{iota_dimension = 0 : i64}> : () -> tensor<2xi32> %2 = mhlo.constant dense<32767> : tensor %3 = mhlo.constant dense<0> : tensor %4:2 = "mhlo.reduce"(%arg0, %1, %2, %3) ({ @@ -436,7 +436,7 @@ func.func @convert_argmin_i16(%arg0: tensor<2xi16>) -> (tensor, tensor func.return %4#0, %4#1 : tensor, tensor // CHECK: %0 = mhlo.constant dense : tensor - // CHECK: %1 = "mhlo.iota"() {iota_dimension = 0 : i64} : () -> tensor<2xi32> + // CHECK: %1 = "mhlo.iota"() <{iota_dimension = 0 : i64}> : () -> tensor<2xi32> // CHECK-DAG: %2 = mhlo.constant dense<32767> : tensor // CHECK: %3 = mhlo.constant dense<0> : tensor // CHECK: %cst = arith.constant dense<0> : tensor<1xi32> @@ -477,7 +477,7 @@ func.func @convert_argmin_constant(%arg0: tensor<2x2x4xf32>) -> (tensor<2x2xf32> // CHECK-LABEL: func.func @convert_argmin_bool func.func @convert_argmin_bool(%arg0: tensor<2xi1>) -> tensor { - %0 = "mhlo.iota"() {iota_dimension = 0 : i64} : () -> tensor<2xi32> + %0 = "mhlo.iota"() <{iota_dimension = 0 : i64}> : () -> tensor<2xi32> %1 = mhlo.constant dense : tensor %2 = mhlo.constant dense<0> : tensor %3:2 = mhlo.reduce(%arg0 init: %1), (%0 init: %2) across dimensions = [0] : (tensor<2xi1>, tensor<2xi32>, tensor, tensor) -> (tensor, tensor) @@ -493,7 +493,7 @@ func.func @convert_argmin_bool(%arg0: tensor<2xi1>) -> tensor { } return %3#1 : tensor - // CHECK: %0 = "mhlo.iota"() {iota_dimension = 0 : i64} : () -> tensor<2xi32> + // CHECK: %0 = "mhlo.iota"() <{iota_dimension = 0 : i64}> : () -> tensor<2xi32> // CHECK-DAG: %1 = mhlo.constant dense : tensor // CHECK: %2 = mhlo.constant dense<0> : tensor // CHECK: %cst = arith.constant dense<0> : tensor<1xi32> @@ -506,7 +506,7 @@ func.func 
@convert_argmin_bool(%arg0: tensor<2xi1>) -> tensor { func.func @convert_argmax_with_reshaped_iota(%arg0: tensor<1x32x1xf32>) -> (tensor<1x1xf32>, tensor<1x1xi32>) { %0 = mhlo.constant dense<0xFF800000> : tensor %1 = mhlo.constant dense<0> : tensor - %2 = "mhlo.iota"() {iota_dimension = 0 : i64} : () -> tensor<32xi32> + %2 = "mhlo.iota"() <{iota_dimension = 0 : i64}> : () -> tensor<32xi32> %3 = "mhlo.reshape"(%2) : (tensor<32xi32>) -> tensor<1x32x1xi32> %4:2 = mhlo.reduce(%arg0 init: %0), (%3 init: %1) across dimensions = [1] : (tensor<1x32x1xf32>, tensor<1x32x1xi32>, tensor, tensor) -> (tensor<1x1xf32>, tensor<1x1xi32>) reducer(%arg1: tensor, %arg3: tensor) (%arg2: tensor, %arg4: tensor) { @@ -525,7 +525,7 @@ func.func @convert_argmax_with_reshaped_iota(%arg0: tensor<1x32x1xf32>) -> (tens // CHECK-DAG: %0 = mhlo.constant dense<0xFF800000> : tensor // CHECK: %1 = mhlo.constant dense<0> : tensor - // CHECK: %2 = "mhlo.iota"() {iota_dimension = 0 : i64} : () -> tensor<32xi32> + // CHECK: %2 = "mhlo.iota"() <{iota_dimension = 0 : i64}> : () -> tensor<32xi32> // CHECK: %3 = mhlo.reshape %2 : (tensor<32xi32>) -> tensor<1x32x1xi32> // CHECK: %cst = arith.constant dense<1> : tensor<1xi32> // CHECK: %4 = "tfl.reduce_max"(%arg0, %cst) {keep_dims = false} : (tensor<1x32x1xf32>, tensor<1xi32>) -> tensor<1x1xf32> @@ -537,7 +537,7 @@ func.func @convert_argmax_with_reshaped_iota(%arg0: tensor<1x32x1xf32>) -> (tens func.func @convert_pytorch_argmax(%arg0: tensor<1x9xi32>) -> tensor<1xi32> { %0 = mhlo.constant dense<0> : tensor %1 = mhlo.constant dense<-2147483648> : tensor - %2 = "mhlo.iota"() {iota_dimension = 0 : i64} : () -> tensor<9xi32> + %2 = "mhlo.iota"() <{iota_dimension = 0 : i64}> : () -> tensor<9xi32> %3 = mhlo.reshape %2 : (tensor<9xi32>) -> tensor<1x9xi32> %4:2 = mhlo.reduce(%arg0 init: %1), (%3 init: %0) across dimensions = [1] : (tensor<1x9xi32>, tensor<1x9xi32>, tensor, tensor) -> (tensor<1xi32>, tensor<1xi32>) reducer(%arg1: tensor, %arg3: tensor) (%arg2: tensor, 
%arg4: tensor) { @@ -553,7 +553,7 @@ func.func @convert_pytorch_argmax(%arg0: tensor<1x9xi32>) -> tensor<1xi32> { // CHECK: %0 = mhlo.constant dense<0> : tensor // CHECK-DAG: %1 = mhlo.constant dense<-2147483648> : tensor - // CHECK: %2 = "mhlo.iota"() {iota_dimension = 0 : i64} : () -> tensor<9xi32> + // CHECK: %2 = "mhlo.iota"() <{iota_dimension = 0 : i64}> : () -> tensor<9xi32> // CHECK: %3 = mhlo.reshape %2 : (tensor<9xi32>) -> tensor<1x9xi32> // CHECK: %cst = arith.constant dense<1> : tensor<1xi32> // CHECK: %4 = "tfl.reduce_max"(%arg0, %cst) {keep_dims = false} : (tensor<1x9xi32>, tensor<1xi32>) -> tensor<1xi32> diff --git a/tensorflow/compiler/mlir/lite/stablehlo/tests/unfold_splat_constant_pass.mlir b/tensorflow/compiler/mlir/lite/stablehlo/tests/unfold_splat_constant_pass.mlir index fbad58fca6e940..fab612eef01a4f 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/tests/unfold_splat_constant_pass.mlir +++ b/tensorflow/compiler/mlir/lite/stablehlo/tests/unfold_splat_constant_pass.mlir @@ -6,7 +6,7 @@ func.func @unfold_splat_constant_float() -> tensor<1x750xf32> { func.return %cst : tensor<1x750xf32> // CHECK-DAG: %0 = mhlo.constant dense<7.680000e+02> : tensor - // CHECK: %1 = "mhlo.broadcast_in_dim"(%0) {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor) -> tensor<1x750xf32> + // CHECK: %1 = "mhlo.broadcast_in_dim"(%0) <{broadcast_dimensions = dense<> : tensor<0xi64>}> : (tensor) -> tensor<1x750xf32> // CHECK: return %1 : tensor<1x750xf32> } @@ -16,7 +16,7 @@ func.func @unfold_splat_constant_integer() -> tensor<1x750xi32> { func.return %cst : tensor<1x750xi32> // CHECK-DAG: %0 = mhlo.constant dense<1> : tensor - // CHECK: %1 = "mhlo.broadcast_in_dim"(%0) {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor) -> tensor<1x750xi32> + // CHECK: %1 = "mhlo.broadcast_in_dim"(%0) <{broadcast_dimensions = dense<> : tensor<0xi64>}> : (tensor) -> tensor<1x750xi32> // CHECK: return %1 : tensor<1x750xi32> } diff --git 
a/tensorflow/compiler/mlir/lite/stablehlo/tests/unfuse_mhlo_batch_norm.mlir b/tensorflow/compiler/mlir/lite/stablehlo/tests/unfuse_mhlo_batch_norm.mlir index 6f2771756cbac5..70a196f2af44c9 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/tests/unfuse_mhlo_batch_norm.mlir +++ b/tensorflow/compiler/mlir/lite/stablehlo/tests/unfuse_mhlo_batch_norm.mlir @@ -16,9 +16,9 @@ func.func @batchNormInference_2D_inner_features( // CHECK-DAG: %[[MULTIPLIER:.+]] = mhlo.multiply %[[VARIANCE_EPS_RSQRT]], %[[SCALE]] : tensor<256xf32> // CHECK-DAG: %[[MUL_MEAN:.+]] = mhlo.multiply %[[MULTIPLIER]], %[[MEAN]] : tensor<256xf32> // CHECK-DAG: %[[RHS:.+]] = mhlo.subtract %[[OFFSET]], %[[MUL_MEAN]] : tensor<256xf32> - // CHECK-DAG: %[[MULTIPLIER_BCAST:.+]] = "mhlo.broadcast_in_dim"(%[[MULTIPLIER]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<256xf32>) -> tensor<4x256xf32> + // CHECK-DAG: %[[MULTIPLIER_BCAST:.+]] = "mhlo.broadcast_in_dim"(%[[MULTIPLIER]]) <{broadcast_dimensions = dense<1> : tensor<1xi64>}> : (tensor<256xf32>) -> tensor<4x256xf32> // CHECK-DAG: %[[X_NORMED:.+]] = mhlo.multiply %[[X]], %[[MULTIPLIER_BCAST]] : tensor<4x256xf32> - // CHECK-DAG: %[[RHS_BCAST:.+]] = "mhlo.broadcast_in_dim"(%[[RHS]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<256xf32>) -> tensor<4x256xf32> + // CHECK-DAG: %[[RHS_BCAST:.+]] = "mhlo.broadcast_in_dim"(%[[RHS]]) <{broadcast_dimensions = dense<1> : tensor<1xi64>}> : (tensor<256xf32>) -> tensor<4x256xf32> // CHECK-DAG: %[[RESULT:.+]] = mhlo.add %[[X_NORMED]], %[[RHS_BCAST]] : tensor<4x256xf32> %0 = "mhlo.batch_norm_inference"(%x, %scale, %offset, %mean, %variance) {epsilon = 1.001000e-05 : f32, feature_index = 1 : i64} : @@ -44,8 +44,8 @@ func.func @batchNormInference_4D_middle_features( // CHECK-DAG: %[[MULTIPLIER:.+]] = mhlo.multiply %[[VARIANCE_EPS_RSQRT]], %[[SCALE]] : tensor<256xf32> // CHECK-DAG: %[[MUL_MEAN:.+]] = mhlo.multiply %[[MULTIPLIER]], %[[MEAN]] : tensor<256xf32> // CHECK-DAG: %[[RHS:.+]] = 
mhlo.subtract %[[OFFSET]], %[[MUL_MEAN]] : tensor<256xf32> - // CHECK-DAG: %[[MULTIPLIER_BCAST:.+]] = "mhlo.broadcast_in_dim"(%[[MULTIPLIER]]) {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<256xf32>) -> tensor<3x4x256x6xf32> - // CHECK-DAG: %[[RHS_BCAST:.+]] = "mhlo.broadcast_in_dim"(%[[RHS]]) {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<256xf32>) -> tensor<3x4x256x6xf32> + // CHECK-DAG: %[[MULTIPLIER_BCAST:.+]] = "mhlo.broadcast_in_dim"(%[[MULTIPLIER]]) <{broadcast_dimensions = dense<2> : tensor<1xi64>}> : (tensor<256xf32>) -> tensor<3x4x256x6xf32> + // CHECK-DAG: %[[RHS_BCAST:.+]] = "mhlo.broadcast_in_dim"(%[[RHS]]) <{broadcast_dimensions = dense<2> : tensor<1xi64>}> : (tensor<256xf32>) -> tensor<3x4x256x6xf32> %0 = "mhlo.batch_norm_inference"(%x, %scale, %offset, %mean, %variance) {epsilon = 1.001000e-05 : f32, feature_index = 2 : i64} : (tensor<3x4x256x6xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, @@ -66,16 +66,16 @@ func.func @batchNormInference_dynamic_shape( -> tensor { // CHECK-DAG: %[[EPS:.+]] = mhlo.constant dense<1.000000e-03> : tensor // CHECK-DAG: %[[VAR_SHAPE:.+]] = shape.shape_of %[[VARIANCE]] : tensor -> tensor<1xindex> - // CHECK-DAG: %[[EPS_BCAST:.+]] = "mhlo.dynamic_broadcast_in_dim"(%[[EPS]], %[[VAR_SHAPE]]) {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor, tensor<1xindex>) -> tensor + // CHECK-DAG: %[[EPS_BCAST:.+]] = "mhlo.dynamic_broadcast_in_dim"(%[[EPS]], %[[VAR_SHAPE]]) <{broadcast_dimensions = dense<> : tensor<0xi64>}> : (tensor, tensor<1xindex>) -> tensor // CHECK-DAG: %[[VARIANCE_EPS:.+]] = mhlo.add %[[VARIANCE]], %[[EPS_BCAST]] : tensor // CHECK-DAG: %[[R_STDDEV:.+]] = mhlo.rsqrt %[[VARIANCE_EPS]] : tensor // CHECK-DAG: %[[MULTIPLIER:.+]] = mhlo.multiply %[[R_STDDEV]], %[[SCALE]] : tensor // CHECK-DAG: %[[MUL_MEAN:.+]] = mhlo.multiply %[[MULTIPLIER]], %[[MEAN]] : tensor // CHECK-DAG: %[[RHS:.+]] = mhlo.subtract %[[OFFSET]], %[[MUL_MEAN]] : tensor // CHECK-DAG: %[[X_SHAPE:.+]] = 
shape.shape_of %[[X]] : tensor -> tensor<4xindex> - // CHECK-DAG: %[[MULTIPLIER_BCAST:.+]] = "mhlo.dynamic_broadcast_in_dim"(%[[MULTIPLIER]], %[[X_SHAPE]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor, tensor<4xindex>) -> tensor + // CHECK-DAG: %[[MULTIPLIER_BCAST:.+]] = "mhlo.dynamic_broadcast_in_dim"(%[[MULTIPLIER]], %[[X_SHAPE]]) <{broadcast_dimensions = dense<1> : tensor<1xi64>}> : (tensor, tensor<4xindex>) -> tensor // CHECK-DAG: %[[X_NORMED:.+]] = mhlo.multiply %[[X]], %[[MULTIPLIER_BCAST]] : tensor - // CHECK-DAG: %[[RHS_BCAST:.+]] = "mhlo.dynamic_broadcast_in_dim"(%[[RHS]], %[[X_SHAPE]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor, tensor<4xindex>) -> tensor + // CHECK-DAG: %[[RHS_BCAST:.+]] = "mhlo.dynamic_broadcast_in_dim"(%[[RHS]], %[[X_SHAPE]]) <{broadcast_dimensions = dense<1> : tensor<1xi64>}> : (tensor, tensor<4xindex>) -> tensor // CHECK-DAG: %[[RESULT:.+]] = mhlo.add %[[X_NORMED]], %[[RHS_BCAST]] : tensor %0 = "mhlo.batch_norm_inference"(%x, %scale, %offset, %mean, %variance) {epsilon = 0.001 : f32, feature_index = 1 : i64} : @@ -136,7 +136,7 @@ func.func @batchNormTraining_4D_middle_features( // CHECK-DAG: %[[X_SHAPE:.+]] = shape.shape_of %[[X]] : tensor<3x4x256x6xf32> -> tensor<4xindex> // CHECK-DAG: %[[EPS:.+]] = mhlo.constant dense<1.000000e+00> : tensor<256xf32> // CHECK-DAG: %[[MEAN:.+]] = "tf.Mean"(%arg0, %[[CST_AXIS]]) <{keep_dims = false}> : (tensor<3x4x256x6xf32>, tensor<3xi32>) -> tensor<256xf32> - // CHECK-DAG: %[[MEAN_BCAST:.+]] = "mhlo.dynamic_broadcast_in_dim"(%[[MEAN]], %[[X_SHAPE]]) {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<256xf32>, tensor<4xindex>) -> tensor<3x4x256x6xf32> + // CHECK-DAG: %[[MEAN_BCAST:.+]] = "mhlo.dynamic_broadcast_in_dim"(%[[MEAN]], %[[X_SHAPE]]) <{broadcast_dimensions = dense<2> : tensor<1xi64>}> : (tensor<256xf32>, tensor<4xindex>) -> tensor<3x4x256x6xf32> // CHECK-DAG: %[[SQ_DIFF:.+]] = "tf.SquaredDifference"(%arg0, %[[MEAN_BCAST]]) : (tensor<3x4x256x6xf32>, 
tensor<3x4x256x6xf32>) -> tensor<3x4x256x6xf32> // CHECK-DAG: %[[VARIANCE:.+]] = "tf.Mean"(%[[SQ_DIFF]], %[[CST_AXIS]]) <{keep_dims = false}> : (tensor<3x4x256x6xf32>, tensor<3xi32>) -> tensor<256xf32> // CHECK-DAG: %[[VARIANCE_EPS:.+]] = mhlo.add %[[VARIANCE]], %[[EPS]] : tensor<256xf32> @@ -144,9 +144,9 @@ func.func @batchNormTraining_4D_middle_features( // CHECK-DAG: %[[MULTIPLIER:.+]] = mhlo.multiply %[[VARIANCE_EPS_RSQRT]], %[[SCALE]] : tensor<256xf32> // CHECK-DAG: %[[MUL_MEAN:.+]] = mhlo.multiply %[[MULTIPLIER]], %[[MEAN]] : tensor<256xf32> // CHECK-DAG: %[[RHS:.+]] = mhlo.subtract %[[OFFSET]], %[[MUL_MEAN]] : tensor<256xf32> - // CHECK-DAG: %[[MULTIPLIER_BCAST:.+]] = "mhlo.broadcast_in_dim"(%[[MULTIPLIER]]) {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<256xf32>) -> tensor<3x4x256x6xf32> + // CHECK-DAG: %[[MULTIPLIER_BCAST:.+]] = "mhlo.broadcast_in_dim"(%[[MULTIPLIER]]) <{broadcast_dimensions = dense<2> : tensor<1xi64>}> : (tensor<256xf32>) -> tensor<3x4x256x6xf32> // CHECK-DAG: %[[X_NORMED:.+]] = mhlo.multiply %[[X]], %[[MULTIPLIER_BCAST]] : tensor<3x4x256x6xf32> - // CHECK-DAG: %[[RHS_BCAST:.+]] = "mhlo.broadcast_in_dim"(%[[RHS]]) {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<256xf32>) -> tensor<3x4x256x6xf32> + // CHECK-DAG: %[[RHS_BCAST:.+]] = "mhlo.broadcast_in_dim"(%[[RHS]]) <{broadcast_dimensions = dense<2> : tensor<1xi64>}> : (tensor<256xf32>) -> tensor<3x4x256x6xf32> // CHECK-DAG: %[[RESULT:.+]] = mhlo.add %[[X_NORMED]], %[[RHS_BCAST]] : tensor<3x4x256x6xf32> %0:3 = "mhlo.batch_norm_training"(%x, %scale, %offset) {epsilon = 1.0 : f32, feature_index = 2 : i64} : diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/composite_avg_pool.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/composite_avg_pool.cc new file mode 100644 index 00000000000000..801c8775682cbd --- /dev/null +++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/composite_avg_pool.cc @@ -0,0 +1,154 @@ +/* Copyright 2024 The TensorFlow 
Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/mlir/lite/stablehlo/transforms/composite_avg_pool.h" + +#include +#include +#include +#include +#include + +#include "absl/status/status.h" +#include "llvm/ADT/Sequence.h" +#include "llvm/Support/Casting.h" +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinTypeInterfaces.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Transforms/DialectConversion.h" // from @llvm-project +#include "tensorflow/compiler/mlir/lite/stablehlo/transforms/composite_utils.h" +#include "tensorflow/compiler/mlir/lite/transforms/passes.h" // IWYU pragma: keep +#include "xla/mlir_hlo/mhlo/IR/hlo_ops.h" // IWYU pragma: keep +#include "tensorflow/core/framework/kernel_shape_util.h" +#include "tensorflow/core/util/padding.h" + +namespace mlir { +namespace odml { + +DenseIntElementsAttr GetPaddingArrayAttr(Builder& builder, Operation* old_op) { + mhlo::CompositeOp composite_op = llvm::dyn_cast(old_op); + auto composite_attrs = composite_op.getCompositeAttributes(); + std::vector padding_vec; + GetI32VectorFromDenseI64CompositeAttr(composite_attrs, "padding", + &padding_vec); + + std::vector result_padding_conf(8, 0); // NHWC + 
result_padding_conf[2] = result_padding_conf[3] = padding_vec[0]; + result_padding_conf[4] = result_padding_conf[5] = padding_vec[1]; + + return DenseIntElementsAttr::get( + RankedTensorType::get({4, 2}, builder.getI32Type()), result_padding_conf); +} + +ShapedType GetPaddedType(Operation* old_op) { + auto input_type = old_op->getOperand(0).getType().cast(); + auto input_shape = input_type.getShape(); // NCHW + int64_t batch_size = input_shape[0]; + int64_t channel_size = input_shape[1]; + int64_t height = input_shape[2]; + int64_t width = input_shape[3]; + + DenseIntElementsAttr padding_attr; + mhlo::CompositeOp composite_op = llvm::dyn_cast(old_op); + auto composite_attributes = composite_op.getCompositeAttributes(); + EnsureAttribute(composite_attributes, "padding", + &padding_attr); + std::vector padding_values(padding_attr.getValues().begin(), + padding_attr.getValues().end()); + int64_t padding_height = padding_values[0]; + int64_t padding_width = padding_values[1]; + + std::array output_shape = { + batch_size, height + 2 * padding_height, width + 2 * padding_width, + channel_size}; // NHWC + return RankedTensorType::get(output_shape, input_type.getElementType()); +} + +// Checks if the provided configuration can be supported by the tensorflow +// "SAME" padding configuration. 
+static bool IsSamePadding(const std::vector& spatial_dim_sizes, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& padding_array) { + for (int dim : llvm::seq(0, spatial_dim_sizes.size())) { + int64_t discard; + int64_t pad_low_ignore; + int64_t pad_high_ignore; + absl::Status status = tensorflow::GetWindowedOutputSizeVerbose( + spatial_dim_sizes[dim], kernel_size[dim], 1, strides[dim], + tensorflow::Padding::SAME, &discard, &pad_low_ignore, &pad_high_ignore); + if (!status.ok()) { + return false; + } + if (padding_array[dim] != pad_low_ignore || + padding_array[dim] != pad_high_ignore) { + return false; + } + } + + return true; +} + +enum class PaddingType { kValid, kSame, kCustom }; + +static PaddingType GetPaddingType(const std::vector& spatial_dim_sizes, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& padding_array) { + if (std::all_of(padding_array.begin(), padding_array.end(), + [](int32_t padding_value) { return padding_value == 0; })) { + return PaddingType::kValid; + } + if (IsSamePadding(spatial_dim_sizes, kernel_size, strides, padding_array)) { + return PaddingType::kSame; + } + return PaddingType::kCustom; +} + +StringAttr GetPaddingStringAttr(Builder& builder, Operation* old_op) { + mhlo::CompositeOp composite_op = llvm::dyn_cast(old_op); + auto composite_attrs = composite_op.getCompositeAttributes(); + + auto operand_shape = + composite_op.getOperand(0).getType().cast().getShape(); + // NC(H)(W) + std::vector spatial_dim_sizes = { + static_cast(operand_shape[2]), + static_cast(operand_shape[3])}; + + std::vector padding_vec, kernel_size_vec, strides_vec; + GetI32VectorFromDenseI64CompositeAttr(composite_attrs, "kernel_size", + &kernel_size_vec); + GetI32VectorFromDenseI64CompositeAttr(composite_attrs, "stride", + &strides_vec); + GetI32VectorFromDenseI64CompositeAttr(composite_attrs, "padding", + &padding_vec); + PaddingType padding_type = GetPaddingType(spatial_dim_sizes, 
kernel_size_vec, + strides_vec, padding_vec); + + switch (padding_type) { + case PaddingType::kValid: + return builder.getStringAttr("VALID"); + case PaddingType::kSame: + return builder.getStringAttr("SAME"); + case PaddingType::kCustom: + return builder.getStringAttr("CUSTOM"); + } +} + +} // namespace odml +} // namespace mlir diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/composite_avg_pool.h b/tensorflow/compiler/mlir/lite/stablehlo/transforms/composite_avg_pool.h new file mode 100644 index 00000000000000..4224f2d6c8ae10 --- /dev/null +++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/composite_avg_pool.h @@ -0,0 +1,50 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_COMPOSITE_AVG_POOL_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_COMPOSITE_AVG_POOL_H_ + +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinTypeInterfaces.h" // from @llvm-project +#include "mlir/Transforms/DialectConversion.h" // from @llvm-project +#include "tensorflow/compiler/mlir/lite/transforms/passes.h" // IWYU pragma: keep +#include "xla/mlir_hlo/mhlo/IR/hlo_ops.h" // IWYU pragma: keep + +namespace mlir { +namespace odml { + +// Given a Composite op that wraps a core.aten.avg_pool2d, returns the padding +// configuration required for the `tfl.pad` if the padding part of the op is +// to be done before average pooling. +DenseIntElementsAttr GetPaddingArrayAttr(Builder& builder, Operation* old_op); + +// Given a Composite op that wraps a core.aten.avg_pool2d, and assuming that +// the padding part is extracted into a tfl.pad op prior to a +// tfl.average_pool_2d, this function finds the return type of the needed +// tfl.pad . +ShapedType GetPaddedType(Operation* old_op); + +// Given a Composite op that wraps a core.aten.avg_pool2d, finds the padding +// attribute to be passed to the a tfl.average_pool_2d that can fully replace +// this composite (here, padding is done directly by the tfl.average_pool_2d as +// opposed to being extracted into a separate tfl.pad). 
+StringAttr GetPaddingStringAttr(Builder& builder, Operation* old_op); + +} // namespace odml +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_COMPOSITE_AVG_POOL_H_ diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/composite_avg_pool_patterns.td b/tensorflow/compiler/mlir/lite/stablehlo/transforms/composite_avg_pool_patterns.td new file mode 100644 index 00000000000000..607b8f520ba6f9 --- /dev/null +++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/composite_avg_pool_patterns.td @@ -0,0 +1,91 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +include "mlir/IR/PatternBase.td" +include "tensorflow/compiler/mlir/lite/stablehlo/transforms/composite_utils.td" + + +// See the function doc in the header file. +def GetPaddedType : NativeCodeCall< + "GetPaddedType((*$0.begin()).getDefiningOp())">; + +// See the function doc in the header file. +def GetPadding: + NativeCodeCall<"GetPaddingStringAttr($_builder, (*$0.begin()).getDefiningOp())">; + +// Returns true if the provided padding in the composite op can *not* be +// satisfied by SAME or VALID tensorflow padding. +def HasCustomPadding: + Constraint>; + +// Returns true if the provided padding in the composite op can be satisfied +// by SAME or VALID tensorflow padding. 
+def HasSameOrValidPadding: Constraint>; + +// See the function doc in the header file. +def GetPaddingArrayAttr: NativeCodeCall<"GetPaddingArrayAttr($_builder, (*$0.begin()).getDefiningOp())">; + +// Replaces an ate.avg_pool2d with a (T -> tfl.average_pool_2d -> T). +// Constraints are added on the attributes of the aten.avg_pool2d to ensure only +// ops that match the behaviour of tfl.average_pool_2d are directly lowered. +def LegalizeAvgPool2dComposite: Pat< + (MHLO_CompositeOp:$old_val + (variadic $a_input), + ConstantStrAttr, $attrs, $_, $_), + (TFL_TransposeOp + (TFL_AveragePool2DOp + /*input*/ (TFL_TransposeOp $a_input, + (Arith_ConstantOp + ConstantAttr,"{0, 2, 3, 1}">)), + /*filter_height*/(GetI32At<0> (GetAsVectorAttr<"kernel_size"> $attrs)), + /*filter_width*/(GetI32At<1> (GetAsVectorAttr<"kernel_size"> $attrs)), + /*padding*/(GetPadding $old_val), + /*stride_h*/(GetI32At<0> (GetAsVectorAttr<"stride"> $attrs)), + /*stride_w*/(GetI32At<1> (GetAsVectorAttr<"stride"> $attrs)), + /*fused_activation_function*/TFL_AF_None, + (returnType (GetNhwcReturnTypeFromNchw $old_val))), + (Arith_ConstantOp + ConstantAttr,"{0, 3, 1, 2}">)), + [(IsBoolCompositeAttribute<"ceil_mode", "false"> $attrs), + (IsBoolCompositeAttribute<"count_include_pad", "false"> $attrs), + (IsStrCompositeAttribute<"divisor_override", "py_None"> $attrs), + (HasSameOrValidPadding $old_val)]>; + +// Replaces an ate.avg_pool2d with (T -> tfl.pad -> tfl.average_pool_2d -> T). 
+def LegalizeAvgPool2dWithPadComposite: Pat< + (MHLO_CompositeOp:$old_val + (variadic $a_input), + ConstantStrAttr, $attrs, $_, $_), + (TFL_TransposeOp + (TFL_AveragePool2DOp:$padded_value + /*input*/ (TFL_PadOp + (TFL_TransposeOp $a_input, + (Arith_ConstantOp + ConstantAttr,"{0, 2, 3, 1}">)), + (Arith_ConstantOp + (GetPaddingArrayAttr $old_val)), + (returnType (GetPaddedType $old_val))), + /*filter_height*/(GetI32At<0> (GetAsVectorAttr<"kernel_size"> $attrs)), + /*filter_width*/(GetI32At<1> (GetAsVectorAttr<"kernel_size"> $attrs)), + /*padding*/TFL_PAD_Valid, + /*stride_h*/(GetI32At<0> (GetAsVectorAttr<"stride"> $attrs)), + /*stride_w*/(GetI32At<1> (GetAsVectorAttr<"stride"> $attrs)), + /*fused_activation_function*/TFL_AF_None, + (returnType (GetNhwcReturnTypeFromNchw $old_val))), + (Arith_ConstantOp + ConstantAttr,"{0, 3, 1, 2}">)), + [(IsBoolCompositeAttribute<"ceil_mode", "false"> $attrs), + (IsStrCompositeAttribute<"divisor_override", "py_None"> $attrs), + (IsBoolCompositeAttribute<"count_include_pad", "true"> $attrs)]>; diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/composite_lowering_pass.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/composite_lowering_pass.cc index 0dc354f998d246..11e2272a145f0b 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/composite_lowering_pass.cc +++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/composite_lowering_pass.cc @@ -16,6 +16,7 @@ limitations under the License. #include #include +#include "mlir/Dialect/Arith/IR/Arith.h" // from @llvm-project #include "mlir/IR/BuiltinOps.h" // from @llvm-project #include "mlir/IR/MLIRContext.h" // from @llvm-project #include "mlir/IR/PatternMatch.h" // from @llvm-project @@ -25,6 +26,8 @@ limitations under the License. 
#include "mlir/Support/TypeID.h" // from @llvm-project #include "mlir/Transforms/DialectConversion.h" // from @llvm-project #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" +#include "tensorflow/compiler/mlir/lite/stablehlo/transforms/composite_avg_pool.h" // IWYU pragma: keep +#include "tensorflow/compiler/mlir/lite/stablehlo/transforms/composite_utils.h" // IWYU pragma: keep #include "tensorflow/compiler/mlir/lite/stablehlo/transforms/passes.h" #include "tensorflow/compiler/mlir/lite/transforms/passes.h" // IWYU pragma: keep #include "xla/mlir_hlo/mhlo/IR/hlo_ops.h" // IWYU pragma: keep @@ -57,6 +60,7 @@ void CompositeLoweringPass::runOnOperation() { ConversionTarget target(context); target.addLegalDialect(); + target.addLegalDialect(); if (failed(applyPartialConversion(getOperation(), target, std::move(patterns)))) { diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/composite_lowering_patterns.td b/tensorflow/compiler/mlir/lite/stablehlo/transforms/composite_lowering_patterns.td index 1b62b6fcc4aeae..829cf2fbaf16a4 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/composite_lowering_patterns.td +++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/composite_lowering_patterns.td @@ -14,15 +14,38 @@ limitations under the License. ==============================================================================*/ // Pattern definition file for direct lowering of mhlo composites to tflite ops. 
- include "mlir/IR/OpBase.td" +include "mlir/Dialect/Arith/IR/ArithOps.td" include "mlir/Dialect/Func/IR/FuncOps.td" include "mhlo/IR/hlo_ops.td" include "tensorflow/compiler/mlir/lite/ir/tfl_ops.td" - +include "tensorflow/compiler/mlir/lite/stablehlo/transforms/composite_utils.td" +include "tensorflow/compiler/mlir/lite/stablehlo/transforms/composite_avg_pool_patterns.td" def LegalizeHardSwishComposite: Pat< - (MHLO_CompositeOp:$old_value + (MHLO_CompositeOp:$old_val (variadic $input), ConstantStrAttr, $_, $_, $_), (TFL_HardSwishOp $input)>; + +// Checks if the given op is an InterpolateBilinear op with NCHW layout. +// Supplied arguments are the input, output op values and the output shape. +def IsSupportedNchwUpsampleBlinear: Constraint())">>; + +def LegalizeTorchUpsampleBlinear2dComposite: Pat< + (MHLO_CompositeOp:$old_val + (variadic $input), + ConstantStrAttr, $attrs, $_, $_), + (TFL_TransposeOp + (TFL_ResizeBilinearOp + (TFL_TransposeOp $input, + (Arith_ConstantOp + ConstantAttr,"{0, 2, 3, 1}">)), + (Arith_ConstantOp:$output_size (GetI32DenseAttr (GetAsVectorAttr<"output"> $attrs))), + (GetCompositeAttributeAs<"align_corners", "BoolAttr"> $attrs), + ConstBoolAttrTrue, + (returnType (GetNhwcReturnTypeFromNchw $old_val))), + (Arith_ConstantOp + ConstantAttr,"{0, 3, 1, 2}">)), + [(IsSupportedNchwUpsampleBlinear $input, $old_val, $attrs)]>; diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/composite_utils.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/composite_utils.cc new file mode 100644 index 00000000000000..403bf9968a9acd --- /dev/null +++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/composite_utils.cc @@ -0,0 +1,105 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/mlir/lite/stablehlo/transforms/composite_utils.h" + +#include +#include +#include +#include +#include +#include + +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinTypeInterfaces.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project + +namespace mlir { +namespace odml { + +DenseIntElementsAttr DenseI64AttrToI32Attr( + const DenseIntElementsAttr& dense_attr, PatternRewriter& builder) { + std::vector ret(dense_attr.getNumElements()); + auto range = dense_attr.getValues(); + std::transform(range.begin(), range.end(), ret.begin(), + [](int64_t attr) { return static_cast(attr); }); + return DenseIntElementsAttr::get( + RankedTensorType::get(ret.size(), builder.getIntegerType(32)), ret); +} + +bool DenseI64AttrToI32Vector(const DenseIntElementsAttr& dense_attr, + std::vector* out_vec) { + std::vector ret(dense_attr.getNumElements()); + auto range = dense_attr.getValues(); + std::transform(range.begin(), range.end(), ret.begin(), + [](int64_t attr) { return static_cast(attr); }); + *out_vec = std::move(ret); + return true; +} + +bool GetI32VectorFromDenseI64CompositeAttr( + const DictionaryAttr& composite_attrs, const std::string& attr_name, + std::vector* out_vec) { + DenseIntElementsAttr attr; + if (!EnsureAttribute(composite_attrs, attr_name, + &attr)) { + return false; + } + + return 
DenseI64AttrToI32Vector(attr, out_vec); +} + +bool IsSupportedNchwUpsampleBlinear( + Value input, Value output, const DenseIntElementsAttr& output_size_attr) { + auto input_shape = input.getType().cast().getShape(); + auto output_shape = output.getType().cast().getShape(); + + // Only support 4D tensor. + if (input_shape.size() != 4 || output_shape.size() != 4) { + return false; + } + + // Only expects the first two dimensions of input and output to be the same as + // in NCHW. + if (input_shape[0] != output_shape[0] || input_shape[1] != output_shape[1]) { + return false; + } + + // Supplied output size should be 2D. + if (output_size_attr.getNumElements() != 2) { + return false; + } + auto output_size = output_size_attr.getValues(); + return output_size[0] == output_shape[2] && output_size[1] == output_shape[3]; +} + +ShapedType GetNhwcReturnTypeFromNchw(Operation* old_op) { + auto composite_result_shape = + old_op->getResults().front().getType().cast().getShape(); + std::array output_shape; + // NHWC <- NCHW + output_shape[0] = composite_result_shape[0]; + output_shape[1] = composite_result_shape[2]; + output_shape[2] = composite_result_shape[3]; + output_shape[3] = composite_result_shape[1]; + + auto input_type = old_op->getOperand(0).getType().cast(); + + return RankedTensorType::get(output_shape, input_type.getElementType()); +} +} // namespace odml +} // namespace mlir diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/composite_utils.h b/tensorflow/compiler/mlir/lite/stablehlo/transforms/composite_utils.h new file mode 100644 index 00000000000000..0691dc74997212 --- /dev/null +++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/composite_utils.h @@ -0,0 +1,83 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_COMPOSITE_UTILS_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_COMPOSITE_UTILS_H_ + +#include +#include +#include + +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Transforms/DialectConversion.h" // from @llvm-project +#include "tensorflow/compiler/mlir/lite/transforms/passes.h" // IWYU pragma: keep +#include "xla/mlir_hlo/mhlo/IR/hlo_ops.h" // IWYU pragma: keep + +namespace mlir { +namespace odml { + +// Ensure an attribute named attr_name exists and it is of type AttrType. +// If so, sets the `out_attr` pointer to point to the casted attribute. +template +bool EnsureAttribute(const DictionaryAttr& composite_attributes, + const std::string& attr_name, AttrType* out_attr) { + Attribute attr = composite_attributes.get(attr_name); + if (!attr.isa_and_nonnull()) { + return false; + } + if (AttrType content = attr.dyn_cast()) { + *out_attr = content; + return true; + } else { + return false; + } +} + +// Changes a DenseIntElementsAttr **containing I64** elements to an I32 Vector. 
+bool DenseI64AttrToI32Vector(const DenseIntElementsAttr& dense_attr, + std::vector* out_vec); + +// Given a DictionaryAttr, checks if it has a DenseIntElementsAttr attribute +// with the name attr_name. If so, extracts its values and stores as a vector +// of int32_t elements. +// Note: This assumes the DenseIntElementsAttr has its values stored as int64_t. +bool GetI32VectorFromDenseI64CompositeAttr( + const DictionaryAttr& composite_attrs, const std::string& attr_name, + std::vector* out_vec); + +// Get a DenseIntElementsAttr of type I64 and convert it to an I32 attribute. +DenseIntElementsAttr DenseI64AttrToI32Attr( + const DenseIntElementsAttr& dense_attr, PatternRewriter& builder); + +// Returns true if the given input and output are in NCHW layout +bool IsSupportedNchwUpsampleBlinear( + Value input, Value output, const DenseIntElementsAttr& output_size_attr); + +// Returns a NHWC shaped type from an NCHW shaped type op. +// For example- Given a Composite op that wraps a core.aten.avg_pool2d, this +// returns the return type of the tfl.average_pool_2d emitted. Note that the +// aten.avg_pool2d works with the NCHW layout while tfl.average_pool_2d assumes +// NHWC. +ShapedType GetNhwcReturnTypeFromNchw(Operation* old_op); + +} // namespace odml + +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_COMPOSITE_UTILS_H_ diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/composite_utils.td b/tensorflow/compiler/mlir/lite/stablehlo/transforms/composite_utils.td new file mode 100644 index 00000000000000..d39a8efb8b13b3 --- /dev/null +++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/composite_utils.td @@ -0,0 +1,64 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef COMPOSITE_UTILS_TD +#define COMPOSITE_UTILS_TD + +include "mlir/IR/PatternBase.td" + +// See the function doc in the header file. +def GetNhwcReturnTypeFromNchw: NativeCodeCall< + "GetNhwcReturnTypeFromNchw((*$0.begin()).getDefiningOp())">; + + +// When given a DenseIntElementsAttr containing I64 elements, this extracts +// one I32IntegerAttr from the given index. +class GetI32At: NativeCodeCall< + "$_builder.getI32IntegerAttr(static_cast(*($0.getValues().begin() + " # index #")))">; + +def GetI32DenseAttr: NativeCodeCall< + "DenseI64AttrToI32Attr($0, $_builder)">; + +// Receives a composite DictionaryAttr and returns the value of the Attribute +// with the key `attr_name` as the type provided by `attr_type`. +class GetCompositeAttributeAs: + NativeCodeCall<"$0.get(\"" # attr_name # "\").dyn_cast<" # attr_type # ">()">; + +// Receives a composite DictionaryAttr and returns the value of the Attribute +// with the key `attr_name` as a DenseIntElementsAttr. +class GetAsVectorAttr: + GetCompositeAttributeAs; + +class IsBoolAttrEqual : Constraint>; + +// Receives a composite DictionaryAttr as an argument and checks if one of the +// its attributes (with the name `attr_name`) is of type `attribute` and has +// the value `val`. +class IsCompositeAttribute: + Constraint>; + +// Receives a composite DictionaryAttr as an argument and checks if has a +// BoolAttr with the name `attr_name` and value `val`. 
+class IsBoolCompositeAttribute : + IsCompositeAttribute; + +// Receives a composite DictionaryAttr as an argument and checks if has a +// StrAttr with the name `attr_name` and value `val`. +class IsStrCompositeAttribute : + IsCompositeAttribute; + +#endif diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_stablehlo_composite_to_tfl_custom.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_stablehlo_composite_to_tfl_custom.cc index a35f5ba324e3f4..6e0a3325460b7a 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_stablehlo_composite_to_tfl_custom.cc +++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_stablehlo_composite_to_tfl_custom.cc @@ -24,6 +24,7 @@ limitations under the License. #include "mlir/IR/Attributes.h" // from @llvm-project #include "mlir/IR/Builders.h" // from @llvm-project #include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/OperationSupport.h" // from @llvm-project #include "mlir/Pass/PassRegistry.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project @@ -47,6 +48,10 @@ bool IsSupportedComposite(::mlir::stablehlo::CompositeOp op) { op.getName()); } +bool IsKVCacheCompositeOp(::mlir::stablehlo::CompositeOp op) { + return op.getName() == "odml.update_kv_cache"; +} + TFL::ConstBytesAttr CustomOption(OpBuilder* builder, const std::string& content) { return TFL::ConstBytesAttr::get(builder->getContext(), @@ -75,6 +80,12 @@ TFL::CustomOp BuildCustomOp(stablehlo::CompositeOp composite, const std::string& custom_option_buffer) { OpBuilder builder(composite->getContext()); builder.setInsertionPoint(composite); + if (IsKVCacheCompositeOp(composite)) { + return builder.create( + composite->getLoc(), composite->getResultTypes(), + composite->getOperands().slice(2, 3), composite.getName(), + CustomOption(&builder, custom_option_buffer)); + } return builder.create( composite->getLoc(), 
composite->getResultTypes(), composite->getOperands(), composite.getName(), @@ -104,11 +115,48 @@ struct LegalizeCompositeToCustomOpPass void runOnOperation() override { func::FuncOp fn = getOperation(); + + int num_layers = 0, current_layer_index = 0; + // First walk the function to count number of KV Caches. + fn.walk([&](Operation* op) { + auto composite = llvm::dyn_cast(op); + if (!composite || !IsKVCacheCompositeOp(composite)) return; + num_layers++; + }); + fn.walk([&](Operation* op) { // Process only StableHLO composite ops. auto composite = llvm::dyn_cast(op); if (!composite || !IsSupportedComposite(composite)) return; + if (IsKVCacheCompositeOp(composite)) { + auto comp_attr = composite.getCompositeAttributes(); + mlir::Builder builder(composite->getContext()); + + // num_layers Composite Attribute. + mlir::StringAttr num_layers_str = builder.getStringAttr("num_layers"); + NamedAttribute num_layers_attr( + num_layers_str, + IntegerAttr::get(IntegerType::get(fn.getContext(), /*width=*/32), + num_layers)); + + // current_layer_index Composite Attribute. + mlir::StringAttr current_layer_str = + builder.getStringAttr("layer_index"); + NamedAttribute current_layer_attr( + current_layer_str, + IntegerAttr::get(IntegerType::get(fn.getContext(), /*width=*/32), + current_layer_index++)); + + // Build a new CompositeAttributes attr, add in the above, + // and set for the op. + mlir::NamedAttrList attributes(comp_attr); + attributes.append(num_layers_attr); + attributes.append(current_layer_attr); + comp_attr = attributes.getDictionary(builder.getContext()); + composite.setCompositeAttributesAttr(comp_attr); + } + // Build flexbuffer options. 
std::string custom_option_buffer; auto fbb = std::make_unique(); diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/tf_stablehlo_pass.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/tf_stablehlo_pass.cc index fcacfcf4984db1..a22c392163b09d 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/tf_stablehlo_pass.cc +++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/tf_stablehlo_pass.cc @@ -52,17 +52,17 @@ class TFToMhloPass public: explicit TFToMhloPass(bool skip_quantization_ops = false, bool skip_resize = false, - bool skip_stateful_partitioned_call = false) + bool skip_partitioned_calls = false) : PassWrapper() { skip_quantization_ops_ = skip_quantization_ops; skip_resize_ = skip_resize; - skip_stateful_partitioned_call_ = skip_stateful_partitioned_call; + skip_partitioned_calls_ = skip_partitioned_calls; } TFToMhloPass(const TFToMhloPass &pass) { skip_quantization_ops_ = pass.skip_quantization_ops_; skip_resize_ = pass.skip_resize_; - skip_stateful_partitioned_call_ = pass.skip_stateful_partitioned_call_; + skip_partitioned_calls_ = pass.skip_partitioned_calls_; } private: @@ -90,9 +90,10 @@ class TFToMhloPass *this, "skip-resize", ::llvm::cl::desc("Skip tf.ResizeBilinear and tf.ResizeNearestNeighbor")}; - Option skip_stateful_partitioned_call_{ - *this, "skip-stateful-partitioned-call", - ::llvm::cl::desc("Skip tf.StatefulPartitionedCall")}; + Option skip_partitioned_calls_{ + *this, "skip-partitioned-calls", + ::llvm::cl::desc( + "Skip tf.StatefulPartitionedCall and tf.PartitionedCall")}; }; void TFToMhloPass::runOnOperation() { @@ -129,7 +130,8 @@ void TFToMhloPass::runOnOperation() { target.addLegalOp(); target.addLegalOp(); } - if (skip_stateful_partitioned_call_) { + if (skip_partitioned_calls_) { + target.addLegalOp(); target.addLegalOp(); } @@ -145,9 +147,10 @@ struct TFToStablehloOptions : public PassPipelineOptions { Option skip_resize{ *this, "skip-resize", ::llvm::cl::desc("Skip tf.ResizeBilinear and 
tf.ResizeNearestNeighbor")}; - Option skip_stateful_partitioned_call{ - *this, "skip-stateful-partitioned-call", - ::llvm::cl::desc("Skip tf.StatefulPartitionedCall")}; + Option skip_partitioned_calls{ + *this, "skip-partitioned-calls", + ::llvm::cl::desc( + "Skip tf.StatefulPartitionedCall and tf.PartitionedCall")}; }; void PopulateLegalizeTFToStablehloPipeline( @@ -157,7 +160,7 @@ void PopulateLegalizeTFToStablehloPipeline( // reusing their work, perhaps through `LowerToMlProgramAndHlo`. pm.addNestedPass(std::make_unique( options.skip_quantization_ops, options.skip_resize, - options.skip_stateful_partitioned_call)); + options.skip_partitioned_calls)); pm.addPass(mlir::createCanonicalizerPass()); pm.addPass(mhlo::createHloLegalizeToStablehloPass()); } @@ -170,11 +173,11 @@ static PassPipelineRegistration void AddLegalizeTFToStablehloPasses(OpPassManager &pm, bool skip_quantization_ops, bool skip_resize, - bool skip_stateful_partitioned_call) { + bool skip_partitioned_calls) { TFToStablehloOptions options; options.skip_quantization_ops = skip_quantization_ops; options.skip_resize = skip_resize; - options.skip_stateful_partitioned_call = skip_stateful_partitioned_call; + options.skip_partitioned_calls = skip_partitioned_calls; PopulateLegalizeTFToStablehloPipeline(pm, options); } diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/tf_stablehlo_pass.h b/tensorflow/compiler/mlir/lite/stablehlo/transforms/tf_stablehlo_pass.h index 91eafb5ab7fa49..c26a3f36daf675 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/tf_stablehlo_pass.h +++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/tf_stablehlo_pass.h @@ -25,7 +25,7 @@ namespace odml { void AddLegalizeTFToStablehloPasses(OpPassManager& pm, bool skip_quantization_ops, bool skip_resize, - bool skip_stateful_partitioned_call); + bool skip_partitioned_calls); } // namespace odml } // namespace mlir diff --git 
a/tensorflow/compiler/mlir/lite/stablehlo/transforms/uniform_quantized_stablehlo_to_tfl_pass.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/uniform_quantized_stablehlo_to_tfl_pass.cc index 8fed8f3f01ed54..ad3bc3cd4cd24d 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/uniform_quantized_stablehlo_to_tfl_pass.cc +++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/uniform_quantized_stablehlo_to_tfl_pass.cc @@ -61,8 +61,10 @@ using ::mlir::quant::CreateI32F32UniformQuantizedPerAxisType; using ::mlir::quant::CreateI32F32UniformQuantizedType; using ::mlir::quant::CreateI8F32UniformQuantizedPerAxisType; using ::mlir::quant::CreateI8F32UniformQuantizedType; +using ::mlir::quant::FindOperandOfType; using ::mlir::quant::FindUserOfType; using ::mlir::quant::GetElementType; +using ::mlir::quant::IsDotGeneralFullyConnected; using ::mlir::quant::IsI32F32UniformQuantizedPerAxisType; using ::mlir::quant::IsI32F32UniformQuantizedType; using ::mlir::quant::IsI8F32UniformQuantizedPerAxisType; @@ -107,6 +109,20 @@ double GetBiasScale(const double input_scale, const double filter_scale) { return filter_scale * input_scale; } +// Returns the optionally broadcasted bias constant op used for a given op. +// If no such constant op exists, returns a nullptr. +Operation* GetBiasConstOp(Operation* op) { + Operation* bias_const_op; + if (Operation* broadcast_in_dim_op = + FindOperandOfType(op); + broadcast_in_dim_op != nullptr) { + bias_const_op = broadcast_in_dim_op->getOperand(0).getDefiningOp(); + } else { + bias_const_op = FindOperandOfType(op); + } + return isa(bias_const_op) ? bias_const_op : nullptr; +} + // Creates a new `tfl.qconst` op for the quantized filter. Transposes the // filter value from [i, o] -> [o, i]. This is because we assume `[i, o]` // format for `stablehlo.dot_general` (i.e. 
contracting dimension == 1) @@ -426,8 +442,7 @@ class RewriteQuantizedDotGeneralOpToTflFullyConnectedOrBatchMatmulOp LogicalResult match(stablehlo::DotGeneralOp op) const override { const stablehlo::DotDimensionNumbersAttr dot_dimension_nums = op.getDotDimensionNumbers(); - const bool is_batch_matmul = - !dot_dimension_nums.getLhsBatchingDimensions().empty(); + const bool is_batch_matmul = !IsDotGeneralFullyConnected(op).value(); const Type elem_type = GetElementType(op.getResult()); const bool has_i32_output = IsI32F32UniformQuantizedType(elem_type) || IsI32F32UniformQuantizedPerAxisType(elem_type); @@ -464,8 +479,7 @@ class RewriteQuantizedDotGeneralOpToTflFullyConnectedOrBatchMatmulOp IsI32F32UniformQuantizedPerAxisType(output_type); const stablehlo::DotDimensionNumbersAttr dot_dimension_nums = op.getDotDimensionNumbers(); - const bool is_batch_matmul = - !dot_dimension_nums.getLhsBatchingDimensions().empty(); + const bool is_batch_matmul = !IsDotGeneralFullyConnected(op).value(); if (is_batch_matmul) { RewriteDotGeneralToTflBatchMatmulOp(op, rewriter, dot_dimension_nums, @@ -793,15 +807,17 @@ class RewriteQuantizedDotGeneralOpToTflFullyConnectedOrBatchMatmulOp .cast() .getZeroPoints(), /*quantization_dimension=*/0); - Operation* stablehlo_bias_op = add_op->getOperand(1).getDefiningOp(); - const auto bias_type = RankedTensorType::getChecked( - op->getLoc(), bias_shape, bias_quantized_type); - const auto bias_value = cast( - cast(stablehlo_bias_op).getValue()); - - *bias_tfl_op = rewriter.create( - op->getLoc(), - /*output=*/TypeAttr::get(bias_type), /*value=*/bias_value); + Operation* bias_const_op = GetBiasConstOp(add_op); + if (bias_const_op != nullptr) { + const auto bias_type = RankedTensorType::getChecked( + op->getLoc(), bias_shape, bias_quantized_type); + const auto bias_value = cast( + cast(bias_const_op).getValue()); + + *bias_tfl_op = rewriter.create( + op->getLoc(), + /*output=*/TypeAttr::get(bias_type), /*value=*/bias_value); + } } else { 
uniform_quantize_op = FindUserOfType(op); } @@ -902,22 +918,14 @@ class RewriteQuantizedConvolutionOp return failure(); } - // TODO: b/309896242 - Lift the assumptions on adjacent ops below - // as we cover more dynamic fused pattern legalization. if (fuse_bias_constant) { Operation* add_op = FindUserOfType(op); if (add_op == nullptr) { LLVM_DEBUG(llvm::dbgs() << "Failed to find AddOp for bias fusion.\n"); return failure(); } - Operation* broadcast_in_dim_op = add_op->getOperand(1).getDefiningOp(); - if (!isa(broadcast_in_dim_op)) { - LLVM_DEBUG(llvm::dbgs() << "Failed to find broadcasted bias.\n"); - return failure(); - } - Operation* bias_const_op = - broadcast_in_dim_op->getOperand(0).getDefiningOp(); - if (!isa(bias_const_op)) { + Operation* bias_const_op = GetBiasConstOp(add_op); + if (bias_const_op == nullptr) { LLVM_DEBUG(llvm::dbgs() << "Failed to find bias constant.\n"); return failure(); } @@ -1413,11 +1421,7 @@ class RewriteQuantizedConvolutionOp TFL::QConstOp bias; if (fuse_bias_constant && has_i32_output) { Operation* add_op = FindUserOfType(op); - // TODO: b/309896242 - Lift the assumptions on adjacent ops below - // as we cover more dynamic fused pattern legalization. 
- Operation* broadcast_in_dim_op = add_op->getOperand(1).getDefiningOp(); - Operation* bias_const_op = - broadcast_in_dim_op->getOperand(0).getDefiningOp(); + Operation* bias_const_op = GetBiasConstOp(add_op); const ElementsAttr bias_constant_value = cast(bias_const_op).getValue(); bias = rewriter.create(op.getLoc(), diff --git a/tensorflow/compiler/mlir/lite/tests/const-fold.mlir b/tensorflow/compiler/mlir/lite/tests/const-fold.mlir index ba9c1e58565286..f244d15294c253 100644 --- a/tensorflow/compiler/mlir/lite/tests/const-fold.mlir +++ b/tensorflow/compiler/mlir/lite/tests/const-fold.mlir @@ -181,6 +181,46 @@ func.func @elementwise_unary_ops() -> (tensor, tensor, tensor, te func.return %7, %8, %9, %10, %11, %12, %13 : tensor, tensor, tensor, tensor, tensor, tensor, tensor } +// CHECK-LABEL: @max_with_neg_f32_max_val +// CHECK-SAME: (%[[ARG0:.+]]: tensor) +func.func @max_with_neg_f32_max_val(%arg0 : tensor) -> (tensor, tensor) { + %neg_f32_max = arith.constant dense<-3.40282347E+38> : tensor + %0 = "tfl.maximum"(%arg0, %neg_f32_max) : (tensor, tensor) -> tensor + %1 = "tfl.maximum"(%neg_f32_max, %arg0) : (tensor, tensor) -> tensor + func.return %0, %1 : tensor, tensor + // CHECK: return %[[ARG0]], %[[ARG0]] +} + +// CHECK-LABEL: @min_with_f32_max_val +// CHECK-SAME: (%[[ARG0:.+]]: tensor) +func.func @min_with_f32_max_val(%arg0 : tensor) -> (tensor, tensor) { + %f32_max = arith.constant dense<3.40282347E+38> : tensor + %0 = "tfl.minimum"(%arg0, %f32_max) : (tensor, tensor) -> tensor + %1 = "tfl.minimum"(%f32_max, %arg0) : (tensor, tensor) -> tensor + func.return %0, %1 : tensor, tensor + // CHECK: return %[[ARG0]], %[[ARG0]] +} + +// CHECK-LABEL: @max_with_neg_f64_max_val +// CHECK-SAME: (%[[ARG0:.+]]: tensor) +func.func @max_with_neg_f64_max_val(%arg0 : tensor) -> (tensor, tensor) { + %neg_f64_max = arith.constant dense<-1.7976931348623157E+308> : tensor + %0 = "tfl.maximum"(%arg0, %neg_f64_max) : (tensor, tensor) -> tensor + %1 = "tfl.maximum"(%neg_f64_max, 
%arg0) : (tensor, tensor) -> tensor + func.return %0, %1 : tensor, tensor + // CHECK: return %[[ARG0]], %[[ARG0]] +} + +// CHECK-LABEL: @min_with_f64_max_val +// CHECK-SAME: (%[[ARG0:.+]]: tensor) +func.func @min_with_f64_max_val(%arg0 : tensor) -> (tensor, tensor) { + %f64_max = arith.constant dense<1.7976931348623157E+308> : tensor + %0 = "tfl.minimum"(%arg0, %f64_max) : (tensor, tensor) -> tensor + %1 = "tfl.minimum"(%f64_max, %arg0) : (tensor, tensor) -> tensor + func.return %0, %1 : tensor, tensor + // CHECK: return %[[ARG0]], %[[ARG0]] +} + // CHECK-LABEL: @mul_int func.func @mul_int() -> (tensor, tensor<4xi32>, tensor<4xi32>, tensor<4xi32>) { %0 = arith.constant dense<8> : tensor diff --git a/tensorflow/compiler/mlir/lite/tests/prepare-quantize.mlir b/tensorflow/compiler/mlir/lite/tests/prepare-quantize.mlir index aeb14ece5e26b7..3c83883dfea9a4 100644 --- a/tensorflow/compiler/mlir/lite/tests/prepare-quantize.mlir +++ b/tensorflow/compiler/mlir/lite/tests/prepare-quantize.mlir @@ -881,6 +881,25 @@ func.func @QuantizedCatsAddRequantsTest(%arg0: tensor<1x1xf32>, %arg1: tensor<1x // CHECK-NEXT: return %[[dqcat_2_0_1_0]], %[[dqcat_2_0_3]] : tensor<1x4xf32>, tensor<1x3xf32> } +// QDQ-LABEL: TransposePerTensorQuantizationPropagation +func.func @TransposePerTensorQuantizationPropagation() -> tensor<2x5xf32> { + %perm = arith.constant dense<[1, 0]> : tensor<2xi32> + %cst = arith.constant dense<1.0> : tensor<5x2xf32> + %q = "tfl.quantize"(%cst) {qtype = tensor<5x2x!quant.uniform:f32, 1.113490e-03>>} : (tensor<5x2xf32>) -> tensor<5x2x!quant.uniform:f32, 1.113490e-03>> + %dq = "tfl.dequantize"(%q) : (tensor<5x2x!quant.uniform:f32, 1.113490e-03>>) -> tensor<5x2xf32> + %t = "tfl.transpose"(%dq, %perm) : (tensor<5x2xf32>, tensor<2xi32>) -> tensor<2x5xf32> + func.return %t : tensor<2x5xf32> + + // QDQ: %[[perm:.*]] = arith.constant dense<[1, 0]> : tensor<2xi32> + // QDQ-NEXT: %[[w:.*]] = arith.constant dense<1.000000e+00> : tensor<5x2xf32> + // QDQ-NEXT: %[[qw:.*]] = 
"tfl.quantize"(%[[w]]) {qtype = tensor<5x2x!quant.uniform:f32 + // QDQ-NEXT: %[[dqw:.*]] = "tfl.dequantize"(%[[qw]]) : (tensor<5x2x!quant.uniform:f32 + // QDQ-NEXT: %[[tp:.*]] = "tfl.transpose"(%[[dqw]], %[[perm]]) : (tensor<5x2xf32>, tensor<2xi32>) -> tensor<2x5xf32> + // QDQ-NEXT: %[[qtw:.*]] = "tfl.quantize"(%[[tp]]) {qtype = tensor<2x5x!quant.uniform:f32 + // QDQ-NEXT: %[[dqtw:.*]] = "tfl.dequantize"(%[[qtw]]) : (tensor<2x5x!quant.uniform:f32 + // QDQ-NEXT: return %[[dqtw]] : tensor<2x5xf32> +} + // QDQ-LABEL: TransposePerChannelNewQuantDim func.func @TransposePerChannelNewQuantDim() -> tensor<2x5xf32> { %perm = arith.constant dense<[1, 0]> : tensor<2xi32> diff --git a/tensorflow/compiler/mlir/lite/transforms/legalize_tf.cc b/tensorflow/compiler/mlir/lite/transforms/legalize_tf.cc index 401f34e6e7943c..38a8bffd87bb03 100644 --- a/tensorflow/compiler/mlir/lite/transforms/legalize_tf.cc +++ b/tensorflow/compiler/mlir/lite/transforms/legalize_tf.cc @@ -1095,9 +1095,9 @@ void LegalizeTFPass::runOnOperation() { addPatterns(context, stage1Patterns, this->preserve_assert_op_); FrozenRewritePatternSet stage1FrozenPatterns(std::move(stage1Patterns)); - if (!applyPatterns(func, target, stage1FrozenPatterns)) + if (!applyPatterns(func, target, stage1FrozenPatterns)) { return signalPassFailure(); - + } // Explict BroadcastTo addition for left-over broadcast-able ops. // The following pattern matchings should be done after the other legalization // rules in order not to add unnecessary BroadcastTo ops. 
@@ -1126,8 +1126,9 @@ void LegalizeTFPass::runOnOperation() { ApplyExplicitBroadcasting>(context); FrozenRewritePatternSet stage2FrozenPatterns(std::move(stage2Patterns)); - if (!applyPatterns(func, target, stage2FrozenPatterns)) + if (!applyPatterns(func, target, stage2FrozenPatterns)) { return signalPassFailure(); + } } } // namespace diff --git a/tensorflow/compiler/mlir/lite/transforms/lift_tflite_flex_ops.cc b/tensorflow/compiler/mlir/lite/transforms/lift_tflite_flex_ops.cc index 2ed12a34059588..e212ce16ee6ccd 100644 --- a/tensorflow/compiler/mlir/lite/transforms/lift_tflite_flex_ops.cc +++ b/tensorflow/compiler/mlir/lite/transforms/lift_tflite_flex_ops.cc @@ -219,7 +219,7 @@ class LiftFlexCustomOp : public OpRewritePattern { for (const auto& name_and_value : node_def.attr()) { const std::string& attr_name = name_and_value.first; const tensorflow::AttrValue& attr_value = name_and_value.second; - StatusOr mlir_attr = + absl::StatusOr mlir_attr = tensorflow::ConvertAttributeValue(attr_value, &builder); if (!mlir_attr.ok()) { return emitError(loc, mlir_attr.status().message()); diff --git a/tensorflow/compiler/mlir/lite/utils/convert_type.cc b/tensorflow/compiler/mlir/lite/utils/convert_type.cc index e09030ceb7515f..f2e659b9aea9ce 100644 --- a/tensorflow/compiler/mlir/lite/utils/convert_type.cc +++ b/tensorflow/compiler/mlir/lite/utils/convert_type.cc @@ -27,7 +27,7 @@ limitations under the License. 
namespace tflite { -using xla::StatusOr; +using absl::StatusOr; namespace errors = tensorflow::errors; diff --git a/tensorflow/compiler/mlir/quantization/common/BUILD b/tensorflow/compiler/mlir/quantization/common/BUILD index 8091fe21ef56ff..da122b67993af7 100644 --- a/tensorflow/compiler/mlir/quantization/common/BUILD +++ b/tensorflow/compiler/mlir/quantization/common/BUILD @@ -57,6 +57,7 @@ tf_cc_test( name = "lift_as_function_call_test", srcs = ["lift_as_function_call_test.cc"], deps = [ + ":attrs_and_constraints", ":func", ":lift_as_function_call", ":test_base", @@ -148,6 +149,8 @@ cc_library( "//tensorflow/compiler/mlir/tensorflow", "//tensorflow/compiler/mlir/tensorflow:xla_call_module_attrs", "@com_google_absl//absl/algorithm:container", + "@com_google_absl//absl/status", + "@com_google_absl//absl/status:statusor", "@llvm-project//llvm:Support", "@llvm-project//mlir:FuncDialect", "@llvm-project//mlir:IR", @@ -164,12 +167,14 @@ tf_cc_test( ":func", ":test_base", "//tensorflow/compiler/mlir/tensorflow", + "@com_google_absl//absl/status", "@com_google_absl//absl/strings:string_view", "@com_google_googletest//:gtest_main", "@llvm-project//llvm:Support", "@llvm-project//mlir:FuncDialect", "@llvm-project//mlir:IR", "@llvm-project//mlir:Support", + "@local_tsl//tsl/platform:status_matchers", "@stablehlo//:stablehlo_ops", ], ) diff --git a/tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.cc b/tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.cc index b1f579bc8e71b8..540eff26685968 100644 --- a/tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.cc +++ b/tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.cc @@ -18,6 +18,8 @@ limitations under the License. 
#include #include "absl/algorithm/container.h" +#include "absl/status/status.h" +#include "absl/status/statusor.h" #include "llvm/ADT/STLExtras.h" #include "llvm/Support/Debug.h" #include "llvm/Support/MathExtras.h" @@ -119,9 +121,11 @@ bool IsHybridQuantizedOp(Operation* op) { !IsQuantizedTensorType(result_type); } -std::optional GetDotGeneralQuantizationDim( - DotGeneralOp dot_general_op) { - if (dot_general_op == nullptr) return std::nullopt; +absl::StatusOr IsDotGeneralFullyConnected(DotGeneralOp dot_general_op) { + if (dot_general_op == nullptr) + return absl::InvalidArgumentError( + "Given dot_general op cannot be null when checking " + "`IsDotGeneralBatchMatmul`."); const ::mlir::stablehlo::DotDimensionNumbersAttr dot_dimension_numbers = dot_general_op.getDotDimensionNumbers(); const ArrayRef lhs_contracting_dims = @@ -132,10 +136,8 @@ std::optional GetDotGeneralQuantizationDim( dot_general_op.getOperand(0).getType().dyn_cast().getRank(); const int64_t filter_rank = dot_general_op.getOperand(1).getType().dyn_cast().getRank(); - // To quantize rhs per-channel, we currently only consider the case where - // `stablehlo.dot_general` is legalizable to `tfl.fully_connected`. // The following conditions are such requirements: - // - rank(lhs) <= 2 + // - rank(lhs) is 1 or 2 // - rank(rhs) = 2 // - size(lhs_contracting_dimensions) = 1 // - size(rhs_contracting_dimensions) = 1 @@ -144,7 +146,8 @@ std::optional GetDotGeneralQuantizationDim( // - quantization_dimension(rhs) should not be in // `rhs_contracting_dimensions`. 
// https://github.com/openxla/stablehlo/blob/main/docs/spec.md#dot_general - const bool has_proper_rank = input_rank <= 2 && filter_rank == 2; + const bool has_proper_rank = + (input_rank == 1 || input_rank == 2) && filter_rank == 2; const bool has_proper_contracting_dim = lhs_contracting_dims.size() == 1 && rhs_contracting_dims.size() == 1 && lhs_contracting_dims[0] == input_rank - 1; @@ -153,9 +156,20 @@ std::optional GetDotGeneralQuantizationDim( const bool has_proper_quantization_dimension = absl::c_find(rhs_contracting_dims, filter_rank) == rhs_contracting_dims.end(); + return has_proper_rank && has_proper_contracting_dim && is_not_batch_op && + has_proper_quantization_dimension; +} + +std::optional GetDotGeneralQuantizationDim( + DotGeneralOp dot_general_op) { + if (dot_general_op == nullptr) return std::nullopt; + const int64_t filter_rank = + dot_general_op.getOperand(1).getType().dyn_cast().getRank(); + + // To quantize rhs per-channel, we currently only consider the case where + // `stablehlo.dot_general` is legalizable to `tfl.fully_connected`. const bool is_per_axis_quantizable = - has_proper_rank && has_proper_contracting_dim && is_not_batch_op && - has_proper_quantization_dimension; + IsDotGeneralFullyConnected(dot_general_op).value(); if (!is_per_axis_quantizable) return std::nullopt; return filter_rank - 1; } diff --git a/tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.h b/tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.h index 852902e229a9fc..490a77a3b73ffa 100644 --- a/tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.h +++ b/tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.h @@ -20,6 +20,7 @@ limitations under the License. 
#include #include +#include "absl/status/statusor.h" #include "llvm/Support/Debug.h" #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project #include "mlir/IR/Builders.h" // from @llvm-project @@ -178,6 +179,15 @@ FailureOr CastI64ToI32(int64_t value); FailureOr> CastI64ArrayToI32( ArrayRef int64_array); +// Returns the first operation with the given type in the function. +template +OpType FindOperationOfType(func::FuncOp function) { + for (auto op : function.getBody().getOps()) { + return op; + } + return nullptr; +} + // Returns the first user of the given operation, optionally of the given // type if provided. If there is no user or user of type, return nullptr. template @@ -190,6 +200,18 @@ Operation* FindUserOfType(Operation* op) { return nullptr; } +// Returns the first user of the given operation, optionally of the given +// type if provided. If there is no user or user of type, return nullptr. +template +Operation* FindOperandOfType(Operation* op) { + for (Value operand_value : op->getOperands()) { + if (isa(operand_value.getDefiningOp())) { + return operand_value.getDefiningOp(); + } + } + return nullptr; +} + // Returns the function attribute for the given call op which is lifted for // quantization. inline FlatSymbolRefAttr GetFuncAttr(TF::PartitionedCallOp call_op) { @@ -216,6 +238,11 @@ inline bool HasQuantizableTrait(Operation* op) { // is quantized. bool IsHybridQuantizedOp(Operation* op); +// Returns whether a given `stablehlo.dot_general` can be legalizable to +// `tfl.fully_connected`. +absl::StatusOr IsDotGeneralFullyConnected( + ::mlir::stablehlo::DotGeneralOp dot_general_op); + // Returns the quantization dimension for a given `stablehlo.dot_general` op, // or `std::nullopt` if the given op is not per-channel quantizable. 
std::optional GetDotGeneralQuantizationDim( diff --git a/tensorflow/compiler/mlir/quantization/common/attrs_and_constraints_test.cc b/tensorflow/compiler/mlir/quantization/common/attrs_and_constraints_test.cc index f6e633aa4c7861..ca0df77f81b51c 100644 --- a/tensorflow/compiler/mlir/quantization/common/attrs_and_constraints_test.cc +++ b/tensorflow/compiler/mlir/quantization/common/attrs_and_constraints_test.cc @@ -20,6 +20,7 @@ limitations under the License. #include #include +#include "absl/status/status.h" #include "absl/strings/string_view.h" #include "llvm/Support/MathExtras.h" #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project @@ -33,11 +34,13 @@ limitations under the License. #include "tensorflow/compiler/mlir/quantization/common/func.h" #include "tensorflow/compiler/mlir/quantization/common/test_base.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tsl/platform/status_matchers.h" namespace mlir::quant { namespace { using ::mlir::stablehlo::AddOp; +using ::mlir::stablehlo::ConstantOp; using ::mlir::stablehlo::ConvolutionOp; using ::mlir::stablehlo::DotGeneralOp; using ::mlir::stablehlo::SubtractOp; @@ -47,6 +50,7 @@ using ::testing::IsEmpty; using ::testing::IsNull; using ::testing::NotNull; using ::testing::Optional; +using ::tsl::testing::StatusIs; using AttrsAndConstraintsTest = ::mlir::quant::QuantizationTestBase; @@ -70,10 +74,11 @@ constexpr absl::string_view kModuleDynamic = R"mlir( constexpr absl::string_view kModuleMultipleUses = R"mlir( module { - func.func @main(%arg0: tensor<1x1024xf32>, %arg1: tensor<1024x3xf32>, %arg2: tensor<1x3xf32>) -> tensor<1x3xf32> attributes {_from_xla_call_module} { + func.func @main(%arg0: tensor<1x1024xf32>, %arg1: tensor<1024x3xf32>) -> tensor<1x3xf32> attributes {_from_xla_call_module} { + %cst = stablehlo.constant dense<1.0> : tensor<1x3xf32> %0 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [1] x [0], precision = [] : (tensor<1x1024xf32>, tensor<1024x3xf32>) -> 
tensor<1x3xf32> - %1 = stablehlo.subtract %arg2, %0 : tensor<1x3xf32> - %2 = stablehlo.add %0, %arg2 : tensor<1x3xf32> + %1 = stablehlo.subtract %cst, %0 : tensor<1x3xf32> + %2 = stablehlo.add %0, %cst : tensor<1x3xf32> return %2 : tensor<1x3xf32> } } @@ -326,6 +331,22 @@ TEST_F(AttrsAndConstraintsTest, FindUserOfDifferentTypes) { EXPECT_THAT(FindUserOfType(dot_general_op), IsNull()); } +TEST_F(AttrsAndConstraintsTest, FindOperandOfDifferentTypes) { + OwningOpRef module_op = ParseModuleOpString(kModuleMultipleUses); + ASSERT_TRUE(module_op); + + func::FuncOp main_fn = FindMainFuncOp(*module_op); + ASSERT_THAT(main_fn, NotNull()); + + auto subtract_op = FindOperationOfType(main_fn); + ASSERT_THAT(subtract_op, NotNull()); + + EXPECT_THAT(FindOperandOfType(subtract_op), NotNull()); + EXPECT_THAT(FindOperandOfType(subtract_op), NotNull()); + EXPECT_THAT(FindOperandOfType<>(subtract_op), NotNull()); + EXPECT_THAT(FindOperandOfType(subtract_op), IsNull()); +} + TEST_F(AttrsAndConstraintsTest, XlaCallModuleOpGetFuncAttr) { OwningOpRef module_op = ParseModuleOpString(kModuleXlaCallModule); ASSERT_TRUE(module_op); @@ -450,6 +471,37 @@ constexpr absl::string_view kModuleDotGeneralBatchMatmul = R"mlir( } )mlir"; +TEST_F(AttrsAndConstraintsTest, IsDotGeneralFullyConnectedReturnsError) { + DotGeneralOp dot_general_op = nullptr; + StatusIs(absl::StatusCode::kInvalidArgument, + "Given dot_general op cannot be null when checking " + "`IsDotGeneralBatchMatmul`"); +} + +TEST_F(AttrsAndConstraintsTest, IsDotGeneralFullyConnectedReturnsTrue) { + OwningOpRef module_op = + ParseModuleOpString(kModuleDotGeneralFullyConnected); + ASSERT_TRUE(module_op); + + func::FuncOp main_fn = FindMainFuncOp(*module_op); + ASSERT_THAT(main_fn, NotNull()); + + auto dot_general_op = *main_fn.getOps().begin(); + EXPECT_THAT(IsDotGeneralFullyConnected(dot_general_op), true); +} + +TEST_F(AttrsAndConstraintsTest, IsDotGeneralFullyConnectedReturnsFalse) { + OwningOpRef module_op = + 
ParseModuleOpString(kModuleDotGeneralBatchMatmul); + ASSERT_TRUE(module_op); + + func::FuncOp main_fn = FindMainFuncOp(*module_op); + ASSERT_THAT(main_fn, NotNull()); + + auto dot_general_op = *main_fn.getOps().begin(); + EXPECT_THAT(IsDotGeneralFullyConnected(dot_general_op), false); +} + TEST_F(AttrsAndConstraintsTest, DotGeneralFullyConnectedReturnsQuantDim) { OwningOpRef module_op = ParseModuleOpString(kModuleDotGeneralFullyConnected); diff --git a/tensorflow/compiler/mlir/quantization/common/lift_as_function_call.h b/tensorflow/compiler/mlir/quantization/common/lift_as_function_call.h index bd7421d376102b..bfef9a13df1a01 100644 --- a/tensorflow/compiler/mlir/quantization/common/lift_as_function_call.h +++ b/tensorflow/compiler/mlir/quantization/common/lift_as_function_call.h @@ -34,6 +34,10 @@ inline constexpr StringRef kFusedFunctionAttr = "tf_quant.composite_function"; // The keyword to detect if this is a `NullAttribute`. inline constexpr StringRef kNullAttributeValue = "N/A"; +// Prefixes attached to lifted functions. +constexpr StringRef kQuantizedFuncPrefix = "quantized_"; +constexpr StringRef kCompositeFuncPrefix = "composite_"; + // The attribute will be used for TF::XlaCallModuleOp to restore the original // function name when loading it back. inline constexpr StringRef kOriginalStablehloEntryFunctionAttrName = diff --git a/tensorflow/compiler/mlir/quantization/common/lift_as_function_call_test.cc b/tensorflow/compiler/mlir/quantization/common/lift_as_function_call_test.cc index c37a997217d2b7..5e5e103ba72018 100644 --- a/tensorflow/compiler/mlir/quantization/common/lift_as_function_call_test.cc +++ b/tensorflow/compiler/mlir/quantization/common/lift_as_function_call_test.cc @@ -33,6 +33,7 @@ limitations under the License. 
#include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "stablehlo/dialect/StablehloOps.h" // from @stablehlo +#include "tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.h" #include "tensorflow/compiler/mlir/quantization/common/func.h" #include "tensorflow/compiler/mlir/quantization/common/test_base.h" #include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h" diff --git a/tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_driver.cc b/tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_driver.cc index 327d109946e031..216a4a2b3d58e9 100644 --- a/tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_driver.cc +++ b/tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_driver.cc @@ -87,6 +87,21 @@ void InitializeStateForValue( cached->second = next_state_index; } +bool HasPerAxisQuantizedOperand(Operation* op) { + for (int i = 0; i < op->getNumOperands(); ++i) { + if (auto dq_op = dyn_cast_or_null( + op->getOperand(i).getDefiningOp())) { + auto type = dq_op.getArg().getType().cast().getElementType(); + if (auto per_axis_qtype = + QuantizedType::getQuantizedElementType(type) + .dyn_cast_or_null()) { + return true; + } + } + } + return false; +} + } // namespace void QuantizationDriver::InitializeArgState(const BlockArgument arg, @@ -480,7 +495,10 @@ void QuantizationDriver::PreprocessConstantOps() { // Skip if the value is NaN or INF. // Otherwise the illegal scale/zp will be calculated. 
auto float_attr = cst.getValueAttr().dyn_cast(); - if (float_attr && !float_attr.getValues()[0].isFinite()) return; + if (float_attr && (float_attr.getValues().empty() || + !float_attr.getValues()[0].isFinite())) { + return; + } const Value value = cst.getResult(); builder_.setInsertionPoint(cst); @@ -788,11 +806,18 @@ bool QuantizationDriver::PropagateParamsAndReturnIfChanged() { // quantization for the quantized kernel. If the quantized dimension // changes, the following logic no longer works as the same `params` // shouldn't be used for both input and output quantization params. - // E.g. TransposeOp's propagation is handled in - // `PropagateTransposedQuantDim` in PrepareQuantize. + // E.g. During TransposeOp's quantization propagation in + // PrepareQuantize, if the quantization is per-axis and the + // QuantizedDimension is transposed, then the output q-dq params must + // reflect the new QuantizedDimension. So, check and skip the + // propagation if any of the operands has a per-axis quantized type param + // and `RequiredSameQuantizedAxes` set to false. + // Currently, these lines of code are only applicable to TFL_TransposeOp + // and the output q-dq propagation for this Op is performed in + // `PropagateTransposedPerAxisQuantDim`. if (is_qdq_conversion_ && !scale_spec->required_same_quantized_axes_func()) { - continue; + if (HasPerAxisQuantizedOperand(op)) continue; } // Use the final state to set all the operands' parameters. diff --git a/tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_utils.h b/tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_utils.h index e1d36df58a3fd9..453dc419371932 100644 --- a/tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_utils.h +++ b/tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_utils.h @@ -134,11 +134,14 @@ using OpQuantSpecGetter = // Quantization scale spec of an op. 
The information defined in the MLIR // interfaces FixedOutputRangeInterface and SameOperandsAndResultsScale should // be checked first if present. +// TODO: b/323478683: Consider deprecating this. struct OpQuantScaleSpec { // Whether this op has a fixed range requirement (e.g. sigmoid) bool has_fixed_output_range = false; - // Whether this op should have same result and operand scales (e.g. concat) + // Whether this op should have same operand and result scales (e.g. concat) bool has_same_scale_requirement = false; + // Whether this op should have same operand and result type (e.g. gather) + bool has_same_operand_and_result_type_requirement = false; // Returns the fixed output range, when has_fixed_output_range is set. GetFixedOutputRangeFunc fixed_output_range_func; // Returns whether same operands and results scales are required. diff --git a/tensorflow/compiler/mlir/quantization/common/test_base.h b/tensorflow/compiler/mlir/quantization/common/test_base.h index a1a770ff616dee..4564de6c3d5603 100644 --- a/tensorflow/compiler/mlir/quantization/common/test_base.h +++ b/tensorflow/compiler/mlir/quantization/common/test_base.h @@ -62,15 +62,6 @@ class QuantizationTestBase : public Test { return parseSourceString(module_op_str, ctx_.get()); } - // Returns the first operation with the given type in the function. - template - OpType FindOperationOfType(func::FuncOp function) { - for (auto op : function.getBody().getOps()) { - return op; - } - return nullptr; - } - // Convenience function that returns the first operation of type `OpT` from // the `@main` function in `module_op`. Useful when testing with a text // representation of a `ModuleOp` containing a single function `@main`. 
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/BUILD b/tensorflow/compiler/mlir/quantization/stablehlo/BUILD index 3b53b3c74bb7cb..3da423119752cb 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/BUILD +++ b/tensorflow/compiler/mlir/quantization/stablehlo/BUILD @@ -8,6 +8,7 @@ load("//tensorflow/core/platform:build_config.bzl", "tf_proto_library") package_group( name = "internal_visibility_allowlist_package", packages = [ + "//learning/brain/mlir/quantization/stablehlo/python/integration_test/...", "//tensorflow/compiler/mlir/lite/...", "//tensorflow/compiler/mlir/quantization/...", "//tensorflow/compiler/mlir/tf2xla/transforms/...", @@ -54,6 +55,7 @@ cc_library( "passes/lift_quantizable_spots_as_functions.cc", "passes/lift_quantizable_spots_as_functions_fusion.inc", "passes/lift_quantizable_spots_as_functions_simple.inc", + "passes/merge_fusion_with_dequantize.cc", "passes/nchw_convolution_to_nhwc.cc", "passes/optimize_graph.cc", "passes/post_quantize.cc", @@ -67,6 +69,7 @@ cc_library( "passes/restore_function_name.cc", "passes/unfuse_mhlo_batch_norm.cc", "passes/unwrap_xla_call_module_op.cc", + "passes/xla_call_module_to_call.cc", ], hdrs = [ "passes/passes.h", @@ -95,6 +98,7 @@ cc_library( "//tensorflow/compiler/mlir/quantization/common/quantization_lib", "//tensorflow/compiler/mlir/quantization/common/quantization_lib:quantization_config", "//tensorflow/compiler/mlir/quantization/stablehlo/cc:permutation", + "//tensorflow/compiler/mlir/quantization/stablehlo/cc:report", "//tensorflow/compiler/mlir/quantization/stablehlo/ops:stablehlo_op_quant_spec", "//tensorflow/compiler/mlir/quantization/tensorflow:passes", "//tensorflow/compiler/mlir/quantization/tensorflow:quantization_options_proto_cc", @@ -111,12 +115,11 @@ cc_library( "//tensorflow/core/ir/types:Dialect", "//tensorflow/core/platform:path", "//tensorflow/core/tpu:tpu_defs", - "//tensorflow/lite/kernels:padding", - "//tensorflow/lite/kernels/internal:quantization_util", 
"@com_google_absl//absl/algorithm:container", "@com_google_absl//absl/base:nullability", "@com_google_absl//absl/cleanup", "@com_google_absl//absl/container:flat_hash_set", + "@com_google_absl//absl/log", "@com_google_absl//absl/random", "@com_google_absl//absl/status", "@com_google_absl//absl/status:statusor", @@ -169,10 +172,7 @@ cc_library( "//tensorflow/compiler/mlir/tensorflow", "//tensorflow/core:protos_all_cc", "//tensorflow/core/platform:path", - "@com_google_absl//absl/algorithm:container", "@com_google_absl//absl/container:flat_hash_set", - "@com_google_absl//absl/status", - "@com_google_absl//absl/status:statusor", "@llvm-project//llvm:Support", "@llvm-project//mlir:FuncDialect", "@llvm-project//mlir:IR", @@ -412,7 +412,7 @@ tf_cc_test( "@local_xla//xla/mlir_hlo", "@local_xla//xla/pjrt:pjrt_client", "@local_xla//xla/pjrt:pjrt_executable", - "@local_xla//xla/pjrt:tfrt_cpu_pjrt_client", + "@local_xla//xla/pjrt/cpu:cpu_client", "@local_xla//xla/tests:literal_test_util", "@stablehlo//:chlo_ops", ], @@ -528,7 +528,6 @@ cc_library( "//tensorflow/compiler/mlir/quantization/tensorflow:quantize_preprocess", "//tensorflow/compiler/mlir/quantization/tensorflow/cc:run_passes", "//tensorflow/compiler/mlir/tensorflow", - "//tensorflow/compiler/mlir/tensorflow/transforms:verify_no_outside_compilation_markers_pass", "@com_google_absl//absl/status", "@com_google_absl//absl/status:statusor", "@com_google_absl//absl/strings:string_view", @@ -745,7 +744,9 @@ tf_proto_library( # py_proto_library( # name = "quantization_config_py_pb2", # api_version = 2, -# visibility = [":internal_visibility_allowlist_package"], +# visibility = [ +# ":internal_visibility_allowlist_package", +# ], # deps = [":quantization_config_proto"], # ) # copybara:uncomment_end diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/BUILD b/tensorflow/compiler/mlir/quantization/stablehlo/cc/BUILD index 77629c7719bf44..5ae92d648bf5c9 100644 --- 
a/tensorflow/compiler/mlir/quantization/stablehlo/cc/BUILD +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/BUILD @@ -321,7 +321,15 @@ cc_library( hdrs = ["report.h"], compatible_with = get_compatible_with_portable(), deps = [ + "//tensorflow/compiler/mlir/quantization/common:lift_as_function_call", "//tensorflow/compiler/mlir/quantization/stablehlo:quantization_config_proto_cc", + "//tensorflow/compiler/mlir/tensorflow", + "@com_google_absl//absl/strings", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Support", + "@local_tsl//tsl/platform:protobuf", ], ) @@ -330,8 +338,12 @@ tf_cc_test( srcs = ["report_test.cc"], deps = [ ":report", + "//tensorflow/compiler/mlir/quantization/common:test_base", "//tensorflow/compiler/mlir/quantization/stablehlo:quantization_config_proto_cc", + "@com_google_absl//absl/strings:string_view", "@com_google_googletest//:gtest_main", + "@llvm-project//mlir:IR", + "@local_tsl//tsl/platform:protobuf", ], ) diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/BUILD b/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/BUILD index 5783ffddd4f050..3fbd4ed586e45f 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/BUILD +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/BUILD @@ -25,6 +25,7 @@ cc_library( compatible_with = get_compatible_with_portable(), deps = [ "//tensorflow/compiler/mlir/quantization/stablehlo:quantization_config_proto_cc", + "//tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration:min_max_value", "//tensorflow/compiler/mlir/quantization/tensorflow:tf_quant_ops", "//tensorflow/compiler/mlir/quantization/tensorflow/calibrator:calibration_statistics_proto_cc", "//tensorflow/compiler/mlir/quantization/tensorflow/calibrator:calibrator_singleton", @@ -99,3 +100,20 @@ tf_cc_test( "@local_tsl//tsl/platform:status_matchers", ], ) + +cc_library( + name = 
"calibration_parameters", + srcs = [], + hdrs = ["calibration_parameters.h"], + compatible_with = get_compatible_with_portable(), + deps = ["//tensorflow/compiler/mlir/quantization/stablehlo:quantization_config_proto_cc"], +) + +tf_cc_test( + name = "calibration_parameters_test", + srcs = ["calibration_parameters_test.cc"], + deps = [ + ":calibration_parameters", + "@com_google_googletest//:gtest_main", + ], +) diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/calibration_parameters.h b/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/calibration_parameters.h new file mode 100644 index 00000000000000..ffad37d15d243c --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/calibration_parameters.h @@ -0,0 +1,79 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_CALIBRATION_CALIBRATION_PARAMETERS_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_CALIBRATION_CALIBRATION_PARAMETERS_H_ + +#include +#include +#include + +#include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h" + +namespace stablehlo::quantization { + +// TODO: b/321158562 - Make the number of bins configurable. +// Default number of histogram bins for each batch sample. 
+constexpr int32_t kDefaultNumOfBins = 1 << 9; + +// Calculates the bin width from the range and expected number of bins. The +// bin width is formalized to the form of 2^n. As a consequence, the actual +// number of bins might be smaller than the given `num_bins`. +inline float CalculateBinWidth(const float min_value, const float max_value, + const int32_t num_bins) { + const float raw_bin_width = (max_value - min_value) / num_bins; + return std::pow(2, std::ceil(std::log2(raw_bin_width))); +} + +// Calculates the lower bound of the histogram. The lower bound is in form of +// `N * bin_width`. +inline float CalculateLowerBound(const float min_value, const float bin_width) { + return std::floor(min_value / bin_width) * bin_width; +} + +// Calculates the bin index of the current value. +inline int32_t CalculateBinIndex(const float value, const float lower_bound, + const float bin_width) { + return std::floor((value - lower_bound) / bin_width); +} + +// Same as `CalculateBinIndex` but clamps to avoid out-of-bound. +inline int32_t CalculateBinIndexSafe(const float value, const float lower_bound, + const float bin_width, + const int32_t num_bins) { + const int32_t bin_index = CalculateBinIndex(value, lower_bound, bin_width); + return std::clamp(bin_index, 0, num_bins - 1); +} + +// Checks if the given method is a histogram-based calibration method. +inline bool IsHistogramCalibration( + const CalibrationOptions::CalibrationMethod method) { + return method == + CalibrationOptions::CALIBRATION_METHOD_HISTOGRAM_PERCENTILE || + method == + CalibrationOptions::CALIBRATION_METHOD_HISTOGRAM_MSE_BRUTEFORCE || + method == CalibrationOptions:: + CALIBRATION_METHOD_HISTOGRAM_MSE_MAX_FREQUENCY || + method == + CalibrationOptions::CALIBRATION_METHOD_HISTOGRAM_MSE_SYMMETRIC; +} + +// Gets the number of bins for the given calibration method. +inline int32_t GetNumBins(const CalibrationOptions::CalibrationMethod method) { + return IsHistogramCalibration(method) ? 
kDefaultNumOfBins : 0; +} + +} // namespace stablehlo::quantization + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_CALIBRATION_CALIBRATION_PARAMETERS_H_ diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/calibration_parameters_test.cc b/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/calibration_parameters_test.cc new file mode 100644 index 00000000000000..bff3f5092a8644 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/calibration_parameters_test.cc @@ -0,0 +1,101 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/calibration_parameters.h" + +#include +#include + +#include + +namespace stablehlo::quantization { +namespace { + +// Calculates the number of bins from the range and bin width. 
+inline int32_t CalculateActualNumBins(const float min_value, + const float max_value, + const float bin_width) { + const float lower_bound = CalculateLowerBound(min_value, bin_width); + return std::ceil((max_value - lower_bound) / bin_width); +} + +TEST(CalibrationParametersTest, CalculateBinWidthSmallerThanOne) { + float bin_width = CalculateBinWidth(/*min_value=*/0.0, /*max_value=*/25.0, + /*num_bins=*/256); + EXPECT_FLOAT_EQ(bin_width, 0.125); + int32_t actual_num_bins = + CalculateActualNumBins(/*min_value=*/0.0, /*max_value=*/25.0, bin_width); + EXPECT_EQ(actual_num_bins, 200); + + // Calculate the bin width with the actual num bins. + float raw_bin_width = 25.0 / actual_num_bins; + EXPECT_FLOAT_EQ(bin_width, raw_bin_width); +} + +TEST(CalibrationParametersTest, CalculateBinWidthLargerThanOne) { + float bin_width = CalculateBinWidth(/*min_value=*/0.0, /*max_value=*/360.0, + /*num_bins=*/256); + EXPECT_FLOAT_EQ(bin_width, 2.0); + int32_t actual_num_bins = + CalculateActualNumBins(/*min_value=*/0.0, /*max_value=*/360.0, bin_width); + EXPECT_EQ(actual_num_bins, 180); + + // Calculate the bin width with the actual num bins. + float raw_bin_width = 360.0 / actual_num_bins; + EXPECT_FLOAT_EQ(bin_width, raw_bin_width); +} + +TEST(CalibrationParametersTest, CalculateBinWidthDivisible) { + float bin_width = CalculateBinWidth(/*min_value=*/0.0, /*max_value=*/256.0, + /*num_bins=*/256); + EXPECT_FLOAT_EQ(bin_width, 1.0); + int32_t actual_num_bins = + CalculateActualNumBins(/*min_value=*/0.0, /*max_value=*/256.0, bin_width); + EXPECT_EQ(actual_num_bins, 256); + + // Calculate the bin width with the actual num bins. + float raw_bin_width = 256.0 / actual_num_bins; + EXPECT_FLOAT_EQ(bin_width, raw_bin_width); +} + +TEST(CalibrationParametersTest, CalculateNumBinsDivisible) { + int32_t num_bins = CalculateActualNumBins( + /*min_value=*/0.0, /*max_value=*/4.0, /*bin_width=*/2.0); + + // Expect 2 bins: [0, 2), [2, 4]. 
+ EXPECT_EQ(num_bins, 2); +} + +TEST(CalibrationParametersTest, CalculateNumBinsNotDivisible) { + int32_t num_bins = CalculateActualNumBins( + /*min_value=*/0.0, /*max_value=*/5.0, /*bin_width=*/2.0); + + // Expect 3 bins: [0, 2), [2, 4), [4, 6]. + EXPECT_EQ(num_bins, 3); +} + +TEST(CalibrationParametersTest, CalculateBinIndex) { + int32_t bin_index = CalculateBinIndexSafe(/*value=*/3.0, /*lower_bound=*/0.0, + /*bin_width=*/2.0, /*num_bins=*/2); + EXPECT_EQ(bin_index, 1); +} + +TEST(CalibrationParametersTest, CalculateBinIndexMaxValue) { + int32_t bin_index = CalculateBinIndexSafe(/*value=*/4.0, /*lower_bound=*/0.0, + /*bin_width=*/2.0, /*num_bins=*/2); + EXPECT_EQ(bin_index, 1); +} + +} // namespace +} // namespace stablehlo::quantization diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/component.cc b/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/component.cc index ba1671ceb696ca..ce626145318b9f 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/component.cc +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/component.cc @@ -14,6 +14,7 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/component.h" +#include #include #include #include @@ -122,7 +123,7 @@ absl::StatusOr CalibrationComponent::Run( // Translates `RepresentativeDatasetConfig`s to signature key -> // `RepresentativeDatasetFile` mapping. const auto dataset_configs = - config.static_range_ptq_preset().representative_datasets(); + config.calibration_options().representative_datasets(); const std::vector dataset_config_vector( dataset_configs.begin(), dataset_configs.end()); TF_ASSIGN_OR_RETURN( @@ -132,10 +133,13 @@ absl::StatusOr CalibrationComponent::Run( // Runs calibration on the exported model. 
The statistics will be stored in a // separate singleton object `CalibratorSingleton` and are directly added to // `exported_model` without re-importing it. - py_function_lib_->RunCalibration( - precalibrated_saved_model_dir, signature_keys_, tags_, - config.calibration_options(), - /*force_graph_mode_calibration=*/true, representative_dataset_file_map); + if (py_function_lib_->RunCalibration( + precalibrated_saved_model_dir, signature_keys_, tags_, + /*force_graph_mode_calibration=*/true, + representative_dataset_file_map) == std::nullopt) { + return absl::InternalError( + "CalibrationComponent error: Failed to run calibration."); + } if (absl::Status status = AddCalibrationStatistics( module_op, config.calibration_options(), *py_function_lib_); diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/statistics.cc b/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/statistics.cc index 39f4ca8449ae05..19a44097458f1a 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/statistics.cc +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/statistics.cc @@ -15,13 +15,13 @@ limitations under the License. 
#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/statistics.h" #include -#include #include "absl/status/status.h" #include "absl/strings/str_format.h" #include "mlir/IR/Builders.h" // from @llvm-project #include "mlir/IR/BuiltinOps.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/min_max_value.h" #include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics.pb.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibrator_singleton.h" @@ -55,11 +55,18 @@ absl::Status AddCalibrationStatistics( return; } - const auto [min_value, max_value] = + const std::optional min_max_values = py_function_library.GetCalibrationMinMaxValue(*statistics, calibration_options); CalibratorSingleton::ClearData(id); + if (min_max_values == std::nullopt) { + status = absl::InternalError( + "Cannot find min/max values for calibration statistics."); + return; + } + + const auto [min_value, max_value] = *min_max_values; mlir::OpBuilder builder(aggregator_op); aggregator_op->setAttr("min", builder.getF32FloatAttr(min_value)); aggregator_op->setAttr("max", builder.getF32FloatAttr(max_value)); diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/config.cc b/tensorflow/compiler/mlir/quantization/stablehlo/cc/config.cc index 0f9932d053cb4d..b3aa1500a0a3c7 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/cc/config.cc +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/config.cc @@ -102,12 +102,21 @@ QuantizationSpec GetDefaultStaticRangePtqSpec(StaticRangePtqPreset preset) { QuantizationSpec spec{}; // Default for all ops. spec.mutable_matcher()->mutable_function_name()->set_regex( - preset.enable_full_int_quantization() ? ".*" : "^.*(conv|dot|gather).*"); + preset.enable_full_int_quantization() ? 
".*" + : "^.*(dot_general|gather).*"); spec.mutable_method()->mutable_static_range_ptq(); return spec; } +QuantizationSpec GetDefaultWeightOnlyPtqSpec(WeightOnlyPtqPreset preset) { + QuantizationSpec spec{}; + spec.mutable_matcher()->mutable_function_name()->set_regex( + "^.*(conv|dot_general).*"); + spec.mutable_method()->mutable_weight_only_ptq(); + return spec; +} + // Returns a `QuantizationSpec` for performing static-range PTQ on the // convolution quantizable unit family. Enables per-channel quantization for // weights, on the channel dimension. @@ -122,14 +131,12 @@ QuantizationSpec GetDefaultStaticRangePtqSpec(StaticRangePtqPreset preset) { // value {dimension_specs {dimension: 3}}}} // }} // } -QuantizationSpec GetStaticRangePtqSpecForConvolution() { +QuantizationSpec GetPtqSpecForConvolution(Method::MethodCase method_case) { QuantizationSpec spec{}; // Matches all convolution quantizable unit family. spec.mutable_matcher()->mutable_function_name()->set_regex( "composite_conv.*"); - StaticRangePtq& static_range_ptq_spec = - *spec.mutable_method()->mutable_static_range_ptq(); // Enable per-channel quantization for convolution weights. QuantizedType conv_weight_quantized_type{}; @@ -140,8 +147,17 @@ QuantizationSpec GetStaticRangePtqSpecForConvolution() { // The index of weight operands passed to lifted functions for convolution // is 1. 
- static_range_ptq_spec.mutable_input_quantized_types()->try_emplace( - 1, std::move(conv_weight_quantized_type)); + if (method_case == Method::kStaticRangePtq) { + StaticRangePtq& static_range_ptq_spec = + *spec.mutable_method()->mutable_static_range_ptq(); + static_range_ptq_spec.mutable_input_quantized_types()->try_emplace( + 1, std::move(conv_weight_quantized_type)); + } else if (method_case == Method::kWeightOnlyPtq) { + WeightOnlyPtq& weight_only_ptq_spec = + *spec.mutable_method()->mutable_weight_only_ptq(); + weight_only_ptq_spec.mutable_input_quantized_types()->try_emplace( + 1, std::move(conv_weight_quantized_type)); + } return spec; }; @@ -164,13 +180,34 @@ void ExpandStaticRangePtqPreset(const StaticRangePtqPreset& preset, QuantizationSpecs new_specs{}; *new_specs.add_specs() = GetDefaultStaticRangePtqSpec(/*preset=*/config.static_range_ptq_preset()); - *new_specs.add_specs() = GetStaticRangePtqSpecForConvolution(); + *new_specs.add_specs() = + GetPtqSpecForConvolution(Method::MethodCase::kStaticRangePtq); + + // Append user-provided specs to override existing specs. + const QuantizationSpecs& previous_specs = config.specs(); + new_specs.mutable_specs()->Add(previous_specs.specs().begin(), + previous_specs.specs().end()); + + config.clear_static_range_ptq_preset(); + config.mutable_specs()->Swap(&new_specs); +} + +void ExpandWeightOnlyPtqPreset(const WeightOnlyPtqPreset& preset, + QuantizationConfig& config) { + // Create a new `QuantizationSpecs` to replace the existing one. The + // expansion from `WeightOnlyPtqPreset` gets populated first and then + // user-provided explicit `QuantizationSpec`s will be appended. + QuantizationSpecs new_specs{}; + *new_specs.add_specs() = + GetDefaultWeightOnlyPtqSpec(/*preset=*/config.weight_only_ptq_preset()); + // TODO: b/307625297 - Add per-channel weight only support. // Append user-provided specs to override existing specs. 
const QuantizationSpecs& previous_specs = config.specs(); new_specs.mutable_specs()->Add(previous_specs.specs().begin(), previous_specs.specs().end()); + config.clear_weight_only_ptq_preset(); config.mutable_specs()->Swap(&new_specs); } @@ -184,6 +221,9 @@ QuantizationConfig ExpandPresets(const QuantizationConfig& config) { case QuantizationConfig::kStaticRangePtqPreset: ExpandStaticRangePtqPreset(config.static_range_ptq_preset(), new_config); break; + case QuantizationConfig::kWeightOnlyPtqPreset: + ExpandWeightOnlyPtqPreset(config.weight_only_ptq_preset(), new_config); + break; default: // Preset has not been specified. The expansion is a no-op. break; @@ -192,6 +232,16 @@ QuantizationConfig ExpandPresets(const QuantizationConfig& config) { return new_config; } +bool HasQuantizationMethod(const QuantizationSpecs& specs, + Method::MethodCase method_case) { + for (const auto& spec : specs.specs()) { + if (spec.method().method_case() == method_case) { + return true; + } + } + return false; +} + QuantizationConfig PopulateDefaults( const QuantizationConfig& user_provided_config) { QuantizationConfig config = user_provided_config; diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/config.h b/tensorflow/compiler/mlir/quantization/stablehlo/cc/config.h index 5dc4554d784c92..19f250bedfe1b8 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/cc/config.h +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/config.h @@ -41,6 +41,10 @@ QuantizationConfig PopulateDefaults( // - No-op. QuantizationConfig ExpandPresets(const QuantizationConfig& config); +// Returns whether a given QuantizationSpecs has the given quantization method. 
+bool HasQuantizationMethod(const QuantizationSpecs& specs, + Method::MethodCase method_case); + } // namespace stablehlo::quantization #endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_CONFIG_H_ diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/config_test.cc b/tensorflow/compiler/mlir/quantization/stablehlo/cc/config_test.cc index e3f2bfde3d10c3..c46daaf1252f26 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/cc/config_test.cc +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/config_test.cc @@ -198,7 +198,7 @@ TEST(ExpandPresetsTest, ExpandStaticRangePtqPresetDefault) { const QuantizationSpec& spec = new_config.specs().specs(0); EXPECT_THAT(spec.matcher().function_name().regex(), - StrEq("^.*(conv|dot|gather).*")); + StrEq("^.*(dot_general|gather).*")); EXPECT_TRUE(spec.method().has_static_range_ptq()); } @@ -274,5 +274,18 @@ TEST(ExpandPresetsTest, ExpandStaticRangePtqPresetThenAppendExplicitSpecs) { EXPECT_TRUE(third_spec.method().has_no_quantization()); } +TEST(ExpandPresetsTest, ExpandWeightOnlyPtqPresetDefault) { + QuantizationConfig config{}; + *config.mutable_weight_only_ptq_preset() = WeightOnlyPtqPreset(); + + const QuantizationConfig new_config = ExpandPresets(config); + ASSERT_THAT(new_config.specs().specs(), SizeIs(1)); + + const QuantizationSpec& spec = new_config.specs().specs(0); + EXPECT_THAT(spec.matcher().function_name().regex(), + StrEq("^.*(conv|dot_general).*")); + EXPECT_TRUE(spec.method().has_weight_only_ptq()); +} + } // namespace } // namespace stablehlo::quantization diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/pass_pipeline.cc b/tensorflow/compiler/mlir/quantization/stablehlo/cc/pass_pipeline.cc index ebe950c58142f6..622ff502c01ed9 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/cc/pass_pipeline.cc +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/pass_pipeline.cc @@ -54,18 +54,19 @@ void AddPreCalibrationPasses(OpPassManager& pm, 
pm.addPass(CreateIssueIDsOfCustomAggregationOpsPass()); } -void AddPostCalibrationPasses( - OpPassManager& pm, const PipelineConfig& pipeline_config, - const StaticRangePtqPreset& static_range_ptq_preset) { +void AddPostCalibrationPasses(OpPassManager& pm, + const PipelineConfig& pipeline_config, + const QuantizationSpecs& specs) { QuantizeCompositeFunctionsPassOptions options; - // TODO: b/331120943 - Use QuantizationConfig instead of preset flags. - options.enable_per_channel_quantized_weight_ = - static_range_ptq_preset.enable_per_channel_quantized_weight(); - options.enable_full_int_quantization_ = - static_range_ptq_preset.enable_full_int_quantization(); + // TODO: b/331120943 - Temporarily set below to true, signaling per-channel + // quantization will be applied for all where applicable. This will be + // replaced by individual `Method` in `QuantizationSpecs`. + options.enable_per_channel_quantized_weight_ = true; // For debugging purposes. options.mlir_dump_file_name_ = "quantize_composite_functions"; options.enable_weight_only_ = false; + options.merge_fusion_with_dequantize_ = + pipeline_config.merge_fusion_with_dequantize(); AddShapeLegalizationPasses(pm); pm.addNestedPass( diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/pass_pipeline.h b/tensorflow/compiler/mlir/quantization/stablehlo/cc/pass_pipeline.h index 4f94506b6c184e..408152f6fc5a49 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/cc/pass_pipeline.h +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/pass_pipeline.h @@ -34,8 +34,7 @@ void AddPreCalibrationPasses( void AddPostCalibrationPasses( OpPassManager& pm, const ::stablehlo::quantization::PipelineConfig& pipeline_config, - const ::stablehlo::quantization::StaticRangePtqPreset& - static_range_ptq_preset); + const ::stablehlo::quantization::QuantizationSpecs& specs); // Adds passes for weight-only quantization. 
void AddWeightOnlyQuantizationPasses( diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/post_calibration.cc b/tensorflow/compiler/mlir/quantization/stablehlo/cc/post_calibration.cc index 6f5f10b48f41f5..001ece707cfe90 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/cc/post_calibration.cc +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/post_calibration.cc @@ -29,7 +29,7 @@ namespace mlir::quant::stablehlo { using ::stablehlo::quantization::PipelineConfig; using ::stablehlo::quantization::QuantizationConfig; -using ::stablehlo::quantization::StaticRangePtqPreset; +using ::stablehlo::quantization::QuantizationSpecs; using ::tensorflow::quantization::RunPasses; PostCalibrationComponent::PostCalibrationComponent( @@ -40,18 +40,17 @@ absl::StatusOr PostCalibrationComponent::Run( ModuleOp module_op, const QuantizationConfig& config) { TF_RETURN_IF_ERROR(RunPasses( kName, /*add_passes_func=*/ - [&config, this](PassManager& pm) { - AddPostCalibrationPasses(pm, config.pipeline_config(), - config.static_range_ptq_preset()); + [&config](PassManager& pm) { + AddPostCalibrationPasses(pm, config.pipeline_config(), config.specs()); }, *ctx_, module_op)); return module_op; } void PostCalibrationComponent::AddPasses( - OpPassManager& pm, const StaticRangePtqPreset& static_range_ptq_preset, + OpPassManager& pm, const QuantizationSpecs& specs, const PipelineConfig& pipeline_config) const { - AddPostCalibrationPasses(pm, pipeline_config, static_range_ptq_preset); + AddPostCalibrationPasses(pm, pipeline_config, specs); } } // namespace mlir::quant::stablehlo diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/post_calibration.h b/tensorflow/compiler/mlir/quantization/stablehlo/cc/post_calibration.h index 3c218c9f857524..6e3762817e16a1 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/cc/post_calibration.h +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/post_calibration.h @@ -47,8 +47,7 @@ class PostCalibrationComponent 
: public Component { void AddPasses( OpPassManager& pm, - const ::stablehlo::quantization::StaticRangePtqPreset& - static_range_ptq_preset, + const ::stablehlo::quantization::QuantizationSpecs& specs, const ::stablehlo::quantization::PipelineConfig& pipeline_config) const; private: diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/pre_calibration.cc b/tensorflow/compiler/mlir/quantization/stablehlo/cc/pre_calibration.cc index a423bdc5f80142..6143b21eec32cd 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/cc/pre_calibration.cc +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/pre_calibration.cc @@ -38,7 +38,7 @@ absl::StatusOr PreCalibrationComponent::Run( ModuleOp module_op, const QuantizationConfig& config) { TF_RETURN_IF_ERROR(RunPasses( kName, /*add_passes_func=*/ - [&config, this](PassManager& pm) { + [&config](PassManager& pm) { AddPreCalibrationPasses(pm, config.calibration_options(), config.specs(), config.debugger_config()); }, diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/report.cc b/tensorflow/compiler/mlir/quantization/stablehlo/cc/report.cc index ef24c16dbf4acc..93be3516d76f8d 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/cc/report.cc +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/report.cc @@ -14,16 +14,142 @@ limitations under the License. 
==============================================================================*/ #include "tensorflow/compiler/mlir/quantization/stablehlo/cc/report.h" +#include +#include #include +#include "absl/strings/str_cat.h" +#include "llvm/Support/raw_ostream.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/Visitors.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/common/lift_as_function_call.h" #include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tsl/platform/protobuf.h" // IWYU pragma: keep namespace mlir::quant::stablehlo { +namespace { using ::stablehlo::quantization::QuantizationResult; +using ::stablehlo::quantization::QuantizationResults; +using ::tsl::protobuf::TextFormat; + +// Given a `quantized_func_name` that starts with `kQuantizedFuncPrefix`, +// converts `kQuantizedFuncPrefix` to `kCompositeFuncPrefix`. +std::string GetCompositeFunctionName(const StringRef quantized_func_name) { + return Twine(kCompositeFuncPrefix) + .concat(quantized_func_name.rsplit(kQuantizedFuncPrefix).second) + .str(); +} + +// Retrieves `QuantizationResult` from `call_op`. If the callee's name starts +// with `kQuantizedFuncPrefix` then a `QuantizationResult` will be returned with +// its `name` field set to the callee's name reverted back to the lifted +// function's name. Otherwise, returns `std::nullopt`. +std::optional GetQuantizationResult(func::CallOp call_op) { + const StringRef callee_name = call_op.getCalleeAttr().getValue(); + + if (callee_name.starts_with(kQuantizedFuncPrefix)) { + // TODO: b/329554870 - Transfer the `Method` used to quantize the op. 
+ QuantizationResult result{}; + result.mutable_quantizable_unit()->set_name( + GetCompositeFunctionName(callee_name)); + return result; + } else { + return std::nullopt; + } +} + +// Retrieves `QuantizationResult` from `xla_call_module_op`. If +// `xla_call_module_op` is a quantizable unit, then a `QuantizationResult` will +// be returned with its `name` field set to the callee's name. The `method` +// field will be set to `NoQuantization` because remaining `xla_call_module_op`s +// means they are not quantized. Returns `std::nullopt` if `xla_call_module_op` +// is not a quantizable unit. +std::optional GetQuantizationResult( + TF::XlaCallModuleOp xla_call_module_op) { + const StringAttr callee_name_attr = + xla_call_module_op + ->getDiscardableAttr(kOriginalStablehloEntryFunctionAttrName) + .dyn_cast_or_null(); + + // `TF::XlaCallModuleOp` without the `_original_entry_function` means it is + // not a quantizable unit. + if (callee_name_attr == nullptr) return std::nullopt; + + if (callee_name_attr.getValue().starts_with(kCompositeFuncPrefix)) { + QuantizationResult result{}; + result.mutable_quantizable_unit()->set_name( + callee_name_attr.getValue().str()); + result.mutable_method()->mutable_no_quantization(); + return result; + } else { + return std::nullopt; + } +} + +// Populates quantized ops from `module_op` to `results`. After going through +// the quantization passes, quantized ops are represented as `func::CallOp` with +// a callee's prefix of `quantized_`. +void PopulateQuantizedResults(ModuleOp module_op, + QuantizationResults& results) { + module_op.walk([&results](func::CallOp call_op) { + std::optional result = GetQuantizationResult(call_op); + if (result == std::nullopt) return WalkResult::skip(); + + *results.add_results() = std::move(*result); + return WalkResult::advance(); + }); +} + +// Populates non-quantized ops from `module_op` to `results`. 
After going +// through the quantization passes, non-quantized quantizable units remain as +// `TF::XlaCallModuleOp` with a callee's prefix of `composite_`. +void PopulateNonQuantizedResults(ModuleOp module_op, + QuantizationResults& results) { + module_op.walk([&results](TF::XlaCallModuleOp xla_call_module_op) { + std::optional result = + GetQuantizationResult(xla_call_module_op); + if (result == std::nullopt) return WalkResult::skip(); + + *results.add_results() = std::move(*result); + return WalkResult::advance(); + }); +} + +} // namespace + +QuantizationReport::QuantizationReport(ModuleOp module_op) + : quantization_results_(CollectResultsFromModuleOp(module_op)) {} + +QuantizationResults QuantizationReport::CollectResultsFromModuleOp( + ModuleOp module_op) const { + QuantizationResults results{}; + + PopulateQuantizedResults(module_op, results); + PopulateNonQuantizedResults(module_op, results); + + return results; +} void QuantizationReport::AddQuantizationResult(QuantizationResult&& result) { *quantization_results_.add_results() = std::move(result); } +std::string QuantizationReport::ToString() const { + std::string results_str{}; + TextFormat::PrintToString(quantization_results_, &results_str); + + return absl::StrCat("===== Quantization Report =====\n\n", results_str, + "\n===== Quantization Report End =====\n\n"); +} + +void QuantizationReport::Print() const { + llvm::outs() << ToString(); + llvm::outs().flush(); // Show the report immediately. +} + } // namespace mlir::quant::stablehlo diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/report.h b/tensorflow/compiler/mlir/quantization/stablehlo/cc/report.h index 94eb47463f16c1..a362bb758cb60c 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/cc/report.h +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/report.h @@ -15,6 +15,9 @@ limitations under the License. 
#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_REPORT_H_ #define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_REPORT_H_ +#include + +#include "mlir/IR/BuiltinOps.h" // from @llvm-project #include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h" namespace mlir::quant::stablehlo { @@ -27,6 +30,10 @@ class QuantizationReport { public: QuantizationReport() = default; + // Initializes `QuantizationReport` by collecting `QuantizationResults` from + // `module_op`. + explicit QuantizationReport(ModuleOp module_op); + // Adds a `QuantizationResult` to the report. void AddQuantizationResult( ::stablehlo::quantization::QuantizationResult&& result); @@ -37,7 +44,16 @@ class QuantizationReport { return quantization_results_; } + // Returns a human-readable string representation of this report. + std::string ToString() const; + + // Prints a human-readable report to stdout. + void Print() const; + private: + ::stablehlo::quantization::QuantizationResults CollectResultsFromModuleOp( + ModuleOp module_op) const; + // Quantization results that are registered in this report. A quantization // result may be added manually by calling `AddQuantizationResult`. ::stablehlo::quantization::QuantizationResults quantization_results_; diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/report_test.cc b/tensorflow/compiler/mlir/quantization/stablehlo/cc/report_test.cc index f6897f7fde401d..4783fb6beebc2d 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/cc/report_test.cc +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/report_test.cc @@ -14,11 +14,17 @@ limitations under the License. 
==============================================================================*/ #include "tensorflow/compiler/mlir/quantization/stablehlo/cc/report.h" +#include #include #include #include +#include "absl/strings/string_view.h" +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/OwningOpRef.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/common/test_base.h" #include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h" +#include "tsl/platform/protobuf.h" // IWYU pragma: keep namespace mlir::quant::stablehlo { namespace { @@ -30,15 +36,18 @@ using ::stablehlo::quantization::QuantizationResults; using ::testing::IsEmpty; using ::testing::SizeIs; using ::testing::StrEq; +using ::tsl::protobuf::TextFormat; -TEST(QuantizationReportTest, GetQuantizationResultsReturnsEmptyResults) { +using QuantizationReportTest = ::mlir::quant::QuantizationTestBase; + +TEST_F(QuantizationReportTest, GetQuantizationResultsReturnsEmptyResults) { QuantizationReport report{}; const QuantizationResults& results = report.GetQuantizationResults(); ASSERT_THAT(results.results(), IsEmpty()); } -TEST(QuantizationReportTest, AddQuantizationResult) { +TEST_F(QuantizationReportTest, AddQuantizationResult) { // Construct a `QuantizationResult` to add, representing a unit named // `quantized_my_function` that is not quantized. 
QuantizationResult result{}; @@ -60,5 +69,144 @@ TEST(QuantizationReportTest, AddQuantizationResult) { EXPECT_TRUE(first_result.method().has_no_quantization()); } +TEST_F(QuantizationReportTest, InitializeWithModuleOp) { + constexpr absl::string_view kQuantizedDotGeneral = R"mlir( + func.func @main(%arg0: tensor<1x2xf32>) -> tensor<1x3xf32> { + %0 = stablehlo.constant() {value = dense<127> : tensor<2x3xi8>} : () -> tensor<2x3x!quant.uniform:f32:1, {1.000000e+0,2.000000e+0,3.000000e+0}>> + %1 = stablehlo.uniform_quantize %arg0 : (tensor<1x2xf32>) -> tensor<1x2x!quant.uniform> + %2 = call @quantized_dot_general_fn(%1, %0) : (tensor<1x2x!quant.uniform>, tensor<2x3x!quant.uniform:f32:1, {1.000000e+0,2.000000e+0,3.000000e+0}>>) -> tensor<1x3x!quant.uniform> + %3 = stablehlo.uniform_dequantize %2 : (tensor<1x3x!quant.uniform>) -> tensor<1x3xf32> + return %3 : tensor<1x3xf32> + } + + func.func private @quantized_dot_general_fn(%arg0: tensor<1x2x!quant.uniform>, %arg1: tensor<2x3x!quant.uniform:f32:1, {1.000000e+0,2.000000e+0,3.000000e+0}>>) -> tensor<1x3x!quant.uniform> { + %0 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [1] x [0] : (tensor<1x2x!quant.uniform>, tensor<2x3x!quant.uniform:f32:1, {1.000000e+0,2.000000e+0,3.000000e+0}>>) -> tensor<1x3x!quant.uniform> + %1 = stablehlo.uniform_quantize %0 : (tensor<1x3x!quant.uniform>) -> tensor<1x3x!quant.uniform> + return %1 : tensor<1x3x!quant.uniform> + } + )mlir"; + + const OwningOpRef module_op = + ParseModuleOpString(kQuantizedDotGeneral); + ASSERT_TRUE(module_op); + + const QuantizationReport report(*module_op); + const QuantizationResults& results = report.GetQuantizationResults(); + ASSERT_THAT(results.results(), SizeIs(1)); + + // Test that the quantized `QuantizableUnit` corresponding to + // `composite_dot_general_fn` is captured. + // TODO: Transfer the `Method` used to quantize the op. 
+ const QuantizationResult& result = results.results(0); + EXPECT_THAT(result.quantizable_unit().name(), + StrEq("composite_dot_general_fn")); + EXPECT_FALSE(result.has_method()); +} + +TEST_F(QuantizationReportTest, InitializeWithModuleOpWithNonQuantizedOp) { + constexpr absl::string_view kNonQuantizedDotGeneral = R"mlir( + func.func @main(%arg0: tensor<1x2xf32>) -> tensor<1x3xf32> { + %0 = stablehlo.constant dense<3.000000e+0> : tensor<2x3xf32> + %1 = "tf.XlaCallModule"(%arg0, %0) {Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_fn, _original_entry_function = "composite_dot_general_fn", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32> + return %1 : tensor<1x3xf32> + } + + func.func private @composite_dot_general_fn(%arg0: tensor<1x2xf32>, %arg1: tensor<2x3xf32>) -> tensor<1x3xf32> { + %0 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [1] x [0] : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32> + return %0 : tensor<1x3xf32> + } + )mlir"; + + const OwningOpRef module_op = + ParseModuleOpString(kNonQuantizedDotGeneral); + ASSERT_TRUE(module_op); + + const QuantizationReport report(*module_op); + const QuantizationResults& results = report.GetQuantizationResults(); + ASSERT_THAT(results.results(), SizeIs(1)); + + // Test that the unquantized `QuantizableUnit` corresponding to + // `composite_dot_general_fn` is captured. The `Method` contains + // `NoQuantization`. 
+ const QuantizationResult& result = results.results(0); + EXPECT_THAT(result.quantizable_unit().name(), + StrEq("composite_dot_general_fn")); + EXPECT_TRUE(result.method().has_no_quantization()); +} + +TEST_F(QuantizationReportTest, + InitializeWithModuleOpWithQuantizedAndNonQuantizedOps) { + constexpr absl::string_view kQuantizedDotGeneralAndNonQuantizedDotGeneral = + R"mlir( + func.func @main(%arg0: tensor<1x2xf32>, %arg1: tensor<1x2xf32>) -> tensor<1x3xf32> { + // Non-quantized dot_general. + %0 = stablehlo.constant dense<3.000000e+0> : tensor<2x3xf32> + %1 = "tf.XlaCallModule"(%arg0, %0) {Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32> + // Quantized dot_general. + %2 = stablehlo.constant() {value = dense<127> : tensor<2x3xi8>} : () -> tensor<2x3x!quant.uniform:f32:1, {1.000000e+0,2.000000e+0,3.000000e+0}>> + %3 = stablehlo.uniform_quantize %arg1 : (tensor<1x2xf32>) -> tensor<1x2x!quant.uniform> + %4 = call @quantized_dot_general_fn_2(%3, %2) : (tensor<1x2x!quant.uniform>, tensor<2x3x!quant.uniform:f32:1, {1.000000e+0,2.000000e+0,3.000000e+0}>>) -> tensor<1x3x!quant.uniform> + %5 = stablehlo.uniform_dequantize %4 : (tensor<1x3x!quant.uniform>) -> tensor<1x3xf32> + // Add is there to prevent from dot_generals from being DCEed. + %6 = stablehlo.add %1, %5 : tensor<1x3xf32> + return %6 : tensor<1x3xf32> + } + + // Callee of non-quantized op. 
+ func.func private @composite_dot_general_fn_1(%arg0: tensor<1x2xf32>, %arg1: tensor<2x3xf32>) -> tensor<1x3xf32> { + %0 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [1] x [0] : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32> + return %0 : tensor<1x3xf32> + } + + // Callee of quantized op. + func.func private @quantized_dot_general_fn_2(%arg0: tensor<1x2x!quant.uniform>, %arg1: tensor<2x3x!quant.uniform:f32:1, {1.000000e+0,2.000000e+0,3.000000e+0}>>) -> tensor<1x3x!quant.uniform> { + %0 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [1] x [0] : (tensor<1x2x!quant.uniform>, tensor<2x3x!quant.uniform:f32:1, {1.000000e+0,2.000000e+0,3.000000e+0}>>) -> tensor<1x3x!quant.uniform> + %1 = stablehlo.uniform_quantize %0 : (tensor<1x3x!quant.uniform>) -> tensor<1x3x!quant.uniform> + return %1 : tensor<1x3x!quant.uniform> + } + )mlir"; + + const OwningOpRef module_op = + ParseModuleOpString(kQuantizedDotGeneralAndNonQuantizedDotGeneral); + ASSERT_TRUE(module_op); + + const QuantizationReport report(*module_op); + const QuantizationResults& results = report.GetQuantizationResults(); + ASSERT_THAT(results.results(), SizeIs(2)); + + // Test that the quantized op is captured in `results`. + const QuantizationResult& quantized_result = results.results(0); + EXPECT_THAT(quantized_result.quantizable_unit().name(), + StrEq("composite_dot_general_fn_2")); + EXPECT_FALSE(quantized_result.has_method()); + + // Test that the non-quantized op is captured in `results`. 
+ const QuantizationResult& non_quantized_result = results.results(1); + EXPECT_THAT(non_quantized_result.quantizable_unit().name(), + StrEq("composite_dot_general_fn_1")); + EXPECT_TRUE(non_quantized_result.method().has_no_quantization()); +} + +TEST_F(QuantizationReportTest, ToString) { + QuantizationResult result{}; + QuantizableUnit& quantizable_unit = *result.mutable_quantizable_unit(); + quantizable_unit.set_name("quantized_my_function"); + + Method& method = *result.mutable_method(); + method.mutable_no_quantization(); + + QuantizationReport report{}; + report.AddQuantizationResult(std::move(result)); + + // Check that the report string is equivalent to the textproto representation + // of the `QuantizationResults`. + std::string result_str{}; + TextFormat::PrintToString(report.GetQuantizationResults(), &result_str); + + EXPECT_THAT(report.ToString(), testing::HasSubstr("Quantization Report")); + EXPECT_THAT(report.ToString(), testing::HasSubstr(result_str)); + EXPECT_THAT(report.ToString(), testing::HasSubstr("Quantization Report End")); +} + } // namespace } // namespace mlir::quant::stablehlo diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/ops/BUILD b/tensorflow/compiler/mlir/quantization/stablehlo/ops/BUILD index 35584857f5761f..61da2af4d3fb58 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/ops/BUILD +++ b/tensorflow/compiler/mlir/quantization/stablehlo/ops/BUILD @@ -40,6 +40,8 @@ tf_cc_test( deps = [ ":stablehlo_op_quant_spec", "//tensorflow/compiler/mlir/lite/quantization/ir:QuantOps", + "//tensorflow/compiler/mlir/quantization/common:attrs_and_constraints", + "//tensorflow/compiler/mlir/quantization/common:func", "//tensorflow/compiler/mlir/quantization/common:test_base", "//tensorflow/compiler/mlir/quantization/common/quantization_lib", "//tensorflow/compiler/mlir/tensorflow", diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/ops/stablehlo_op_quant_spec.cc 
b/tensorflow/compiler/mlir/quantization/stablehlo/ops/stablehlo_op_quant_spec.cc index c78ee607993385..3018db7b2649e9 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/ops/stablehlo_op_quant_spec.cc +++ b/tensorflow/compiler/mlir/quantization/stablehlo/ops/stablehlo_op_quant_spec.cc @@ -131,7 +131,7 @@ std::unique_ptr GetStableHloOpQuantSpec(Operation* op) { return spec; } -std::unique_ptr GetStableHloQuantScaleSpec(Operation* op) { +std::unique_ptr GetStableHloQuantConstraints(Operation* op) { auto scale_spec = std::make_unique(); if (llvm::isa GetStableHloQuantScaleSpec(Operation* op) { mlir::stablehlo::SliceOp, mlir::stablehlo::TransposeOp>(op)) { scale_spec->has_same_scale_requirement = true; } + if (llvm::isa(op)) { + scale_spec->has_same_operand_and_result_type_requirement = true; + } return scale_spec; } @@ -165,7 +169,7 @@ bool IsOpQuantizableStableHlo(Operation* op) { return false; } - if (GetStableHloQuantScaleSpec(op)->has_same_scale_requirement) { + if (GetStableHloQuantConstraints(op)->has_same_scale_requirement) { return true; } diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/ops/stablehlo_op_quant_spec.h b/tensorflow/compiler/mlir/quantization/stablehlo/ops/stablehlo_op_quant_spec.h index 6edeb9829b6b63..6c688e823c96ba 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/ops/stablehlo_op_quant_spec.h +++ b/tensorflow/compiler/mlir/quantization/stablehlo/ops/stablehlo_op_quant_spec.h @@ -28,9 +28,9 @@ namespace mlir::quant::stablehlo { // Returns StableHLO quantization specs for an op. std::unique_ptr GetStableHloOpQuantSpec(Operation* op); -// Returns quantization scale specs (fixed output, same scale) for a StableHLO -// op. -std::unique_ptr GetStableHloQuantScaleSpec(Operation* op); +// Returns quantization constraints (ex: fixed output, same scale) given +// a StableHLO op. +std::unique_ptr GetStableHloQuantConstraints(Operation* op); // Checks if an op is quantizable in StableHLO quantizer. 
Argument op is not // necessarily a StableHLO op. diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/ops/stablehlo_op_quant_spec_test.cc b/tensorflow/compiler/mlir/quantization/stablehlo/ops/stablehlo_op_quant_spec_test.cc index b3ba4818284498..572bf0e05729b0 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/ops/stablehlo_op_quant_spec_test.cc +++ b/tensorflow/compiler/mlir/quantization/stablehlo/ops/stablehlo_op_quant_spec_test.cc @@ -26,6 +26,8 @@ limitations under the License. #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "stablehlo/dialect/StablehloOps.h" // from @stablehlo #include "tensorflow/compiler/mlir/lite/quantization/ir/QuantOps.h" +#include "tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.h" +#include "tensorflow/compiler/mlir/quantization/common/func.h" #include "tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_utils.h" #include "tensorflow/compiler/mlir/quantization/common/test_base.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" @@ -34,7 +36,9 @@ limitations under the License. 
namespace mlir::quant::stablehlo { namespace { +using ::mlir::stablehlo::GatherOp; using ::testing::IsEmpty; +using ::testing::IsTrue; using ::testing::NotNull; using ::testing::Pair; using ::testing::UnorderedElementsAre; @@ -284,5 +288,42 @@ TEST_F(GetStableHloOpQuantSpecTest, UnorderedElementsAre(Pair(1, 3))); } +using GetStableHloQuantConstraintsTest = ::mlir::quant::QuantizationTestBase; + +TEST_F(GetStableHloQuantConstraintsTest, + HasSameOperandAndResultTypeRequirementSucceeds) { + // Quantizable ops: constants + // Non-quantizable ops: normal StableHLO ops and terminators + constexpr absl::string_view kModuleGather = R"mlir( + module { + func.func @main() -> (tensor<2x3x2x2xf32>) { + %0 = stablehlo.constant dense<1.0> : tensor<3x4x2xf32> + %1 = stablehlo.constant dense<2> : tensor<2x3x2xi64> + %2 = "stablehlo.gather"(%0, %1) { + dimension_numbers = #stablehlo.gather< + offset_dims = [2, 3], + collapsed_slice_dims = [0], + start_index_map = [1, 0], + index_vector_dim = 2>, + slice_sizes = array, + indices_are_sorted = false + } : (tensor<3x4x2xf32>, tensor<2x3x2xi64>) -> tensor<2x3x2x2xf32> + func.return %2 : tensor<2x3x2x2xf32> + } + } + )mlir"; + OwningOpRef module_op = ParseModuleOpString(kModuleGather); + ASSERT_TRUE(module_op); + + func::FuncOp main_fn = FindMainFuncOp(*module_op); + ASSERT_THAT(main_fn, NotNull()); + + Operation* gather_op = FindOperationOfType(main_fn); + const auto spec = GetStableHloQuantConstraints(gather_op); + + EXPECT_THAT(spec, NotNull()); + EXPECT_THAT(spec->has_same_operand_and_result_type_requirement, IsTrue()); +} + } // namespace } // namespace mlir::quant::stablehlo diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/convert_tf_quant_to_mhlo_int_test.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/convert_tf_quant_to_mhlo_int_test.cc index 1987b607392379..ad0179f3c051a1 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/convert_tf_quant_to_mhlo_int_test.cc 
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/convert_tf_quant_to_mhlo_int_test.cc @@ -48,9 +48,9 @@ limitations under the License. #include "xla/literal.h" #include "xla/literal_util.h" #include "xla/mlir_hlo/mhlo/IR/hlo_ops.h" +#include "xla/pjrt/cpu/cpu_client.h" #include "xla/pjrt/pjrt_client.h" #include "xla/pjrt/pjrt_executable.h" -#include "xla/pjrt/tfrt_cpu_pjrt_client.h" #include "xla/shape.h" #include "xla/shape_util.h" #include "xla/tests/literal_test_util.h" diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/lift_quantizable_spots_as_functions.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/lift_quantizable_spots_as_functions.cc index a4bf42ec6f8eba..6577666ab90f10 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/lift_quantizable_spots_as_functions.cc +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/lift_quantizable_spots_as_functions.cc @@ -177,9 +177,6 @@ FailureOr QuantizationMethodToTextProto(const Method& method) { // TODO: b/307620778 - Support more advanced selective quantization methods. LogicalResult ApplyQuantizationSpec(const QuantizationSpec& spec, ModuleOp module_op) { - func::FuncOp main_func = FindMainFuncOp(module_op); - if (!main_func) return failure(); - const Method& quantization_method = spec.method(); FailureOr quantization_method_txtpb = @@ -187,14 +184,18 @@ LogicalResult ApplyQuantizationSpec(const QuantizationSpec& spec, if (failed(quantization_method_txtpb)) return failure(); const FunctionNameMatcher matcher(spec.matcher().function_name()); - for (auto xla_call_module_op : main_func.getOps()) { - if (!matcher.Match(xla_call_module_op)) continue; - - // Set the text representation of `Method` to matched `TF::XlaCallModuleOp`. - xla_call_module_op->setAttr( - kQuantizationMethodAttr, - StringAttr::get(module_op.getContext(), - std::move(*quantization_method_txtpb))); + // Iterate over all XlaCallModuleOp in all FuncOps. 
+ for (auto func : module_op.getOps()) { + for (auto xla_call_module_op : func.getOps()) { + if (!matcher.Match(xla_call_module_op)) continue; + + // Set the text representation of `Method` to matched + // `TF::XlaCallModuleOp`. + xla_call_module_op->setAttr( + kQuantizationMethodAttr, + StringAttr::get(module_op.getContext(), + std::move(*quantization_method_txtpb))); + } } return success(); } diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/lift_quantizable_spots_as_functions_fusion.td b/tensorflow/compiler/mlir/quantization/stablehlo/passes/lift_quantizable_spots_as_functions_fusion.td index 6377740bf6018e..75940a24cf484f 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/lift_quantizable_spots_as_functions_fusion.td +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/lift_quantizable_spots_as_functions_fusion.td @@ -40,6 +40,28 @@ def LiftDotGeneralWithBiasSameShape : Pat< (NamedAttr<"precision_config"> (DefaultOrNullAttr $precision_config)))), [(IsNotInLiftedFunc $res), (IsStableHLOConstantOp $bias)], [], (addBenefit 5)>; +def LiftConvWithBiasSameShape : Pat< + (StableHLO_AddOp:$res + (StableHLO_ConvolutionOp $lhs, $rhs, $window_strides, $padding, + $lhs_dilation, $rhs_dilation, $window_reversal, $dimension_numbers, + $feature_group_count, $batch_group_count, $precision_config), + $bias), + (LiftAsTFXlaCallModule<"composite_conv_with_bias_same_shape_fn"> + (ArgumentList $lhs, $rhs, $bias), + (ResultList $res), + (NamedAttributeList + (NamedAttr<"window_strides"> (DefaultOrNullAttr $window_strides)), + (NamedAttr<"padding"> (DefaultOrNullAttr $padding)), + (NamedAttr<"lhs_dilation"> (DefaultOrNullAttr $lhs_dilation)), + (NamedAttr<"rhs_dilation"> (DefaultOrNullAttr $rhs_dilation)), + (NamedAttr<"window_reversal"> (DefaultOrNullAttr $window_reversal)), + (NamedAttr<"dimension_numbers"> $dimension_numbers), + (NamedAttr<"feature_group_count"> $feature_group_count), + (NamedAttr<"batch_group_count"> $batch_group_count), 
+ (NamedAttr<"precision_config"> (DefaultOrNullAttr $precision_config)))), + [(IsNotInLiftedFunc $res), (IsStableHLOConstantOp $bias)], [], (addBenefit 5)>; + + def LiftConvWithBias : Pat< (StableHLO_AddOp:$res (StableHLO_ConvolutionOp $lhs, $rhs, $window_strides, $padding, @@ -245,6 +267,31 @@ def LiftDotGeneralWithBiasSameShapeAndRelu : Pat< [(IsNotInLiftedFunc $res), (FloatValueEquals<"0"> $cst), (IsStableHLOConstantOp $bias)], [], (addBenefit 10)>; +def LiftConvWithBiasSameShapeAndRelu : Pat< + (StableHLO_MaxOp:$res + (StableHLO_AddOp + (StableHLO_ConvolutionOp $lhs, $rhs, $window_strides, $padding, + $lhs_dilation, $rhs_dilation, $window_reversal, $dimension_numbers, + $feature_group_count, $batch_group_count, $precision_config), + $bias), + (StableHLO_ConstantOp $cst)), + (LiftAsTFXlaCallModule<"composite_conv_with_bias_same_shape_and_relu_fn"> + (ArgumentList $lhs, $rhs, $bias), + (ResultList $res), + (NamedAttributeList + (NamedAttr<"window_strides"> (DefaultOrNullAttr $window_strides)), + (NamedAttr<"padding"> (DefaultOrNullAttr $padding)), + (NamedAttr<"lhs_dilation"> (DefaultOrNullAttr $lhs_dilation)), + (NamedAttr<"rhs_dilation"> (DefaultOrNullAttr $rhs_dilation)), + (NamedAttr<"window_reversal"> (DefaultOrNullAttr $window_reversal)), + (NamedAttr<"dimension_numbers"> $dimension_numbers), + (NamedAttr<"feature_group_count"> $feature_group_count), + (NamedAttr<"batch_group_count"> $batch_group_count), + (NamedAttr<"precision_config"> (DefaultOrNullAttr $precision_config)))), + [(IsNotInLiftedFunc $res), + (FloatValueEquals<"0"> $cst), (IsStableHLOConstantOp $bias)], [], (addBenefit 10)>; + + def LiftConvWithBiasAndRelu : Pat< (StableHLO_MaxOp:$res (StableHLO_AddOp diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/merge_fusion_with_dequantize.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/merge_fusion_with_dequantize.cc new file mode 100644 index 00000000000000..acfe3cfd6fc6b2 --- /dev/null +++ 
b/tensorflow/compiler/mlir/quantization/stablehlo/passes/merge_fusion_with_dequantize.cc @@ -0,0 +1,145 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include + +#include "llvm/Support/Casting.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/Dialect/Quant/QuantTypes.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinTypeInterfaces.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/OpDefinition.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/IR/SymbolTable.h" // from @llvm-project +#include "mlir/IR/TypeUtilities.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "mlir/Support/TypeID.h" // from @llvm-project +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project +#include "stablehlo/dialect/ChloOps.h" // from @stablehlo +#include "stablehlo/dialect/StablehloOps.h" // from @stablehlo +#include "tensorflow/compiler/mlir/lite/transforms/passes.h" + +namespace mlir::quant::stablehlo { + +#define GEN_PASS_DEF_MERGEFUSIONWITHDEQUANTIZEPASS +#include "tensorflow/compiler/mlir/quantization/stablehlo/passes/passes.h.inc" + 
+namespace { + +class MergeFusionWithDequantizePass + : public impl::MergeFusionWithDequantizePassBase< + MergeFusionWithDequantizePass> { + public: + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(MergeFusionWithDequantizePass) + + explicit MergeFusionWithDequantizePass() = default; + + private: + void runOnOperation() override; +}; + +class MergeFusionWithUniformDequantizePattern + : public OpRewritePattern { + public: + explicit MergeFusionWithUniformDequantizePattern(MLIRContext* context) + : OpRewritePattern(context) {} + LogicalResult matchAndRewrite(func::CallOp call_op, + PatternRewriter& rewriter) const override { + if (call_op.getNumResults() != 1) return failure(); + auto users = call_op->getUsers(); + for (auto user : users) { + if (!llvm::isa(user)) { + return failure(); + } + } + auto func_name = call_op.getCallee(); + if (!func_name.starts_with("quantized_")) return failure(); + if (call_op->getNumResults() != 1) return failure(); + if (!getElementTypeOrSelf(call_op->getResult(0).getType()) + .isa()) + return failure(); + + // Fetch the callee function. + SymbolTable symbol_table(call_op->getParentOfType()); + auto func_op = + dyn_cast_or_null(symbol_table.lookup(func_name)); + if (!func_op) return failure(); + // The quantized fusion should have requantize and return ops at the end. + auto return_op = dyn_cast_or_null( + func_op.getRegion().getBlocks().front().getTerminator()); + if (!return_op) return failure(); + auto req_op = llvm::dyn_cast_or_null( + return_op.getOperands()[0].getDefiningOp()); + if (!req_op) return failure(); + + // Create a new func.call op with f32 output. + auto new_call_op = call_op.clone(); + new_call_op->getResult(0).setType( + call_op.getResult(0).getType().cast().clone( + rewriter.getF32Type())); + rewriter.setInsertionPoint(call_op); + rewriter.insert(new_call_op); + + // Remove the dequantize ops and replace uses by the new func.call op. 
+ SmallVector users_to_erase; + for (auto user : users) { + llvm::dyn_cast(user) + .replaceAllUsesWith(new_call_op.getResult(0)); + users_to_erase.push_back(user); + } + for (auto user : users_to_erase) rewriter.eraseOp(user); + rewriter.eraseOp(call_op); + func_op.eraseResult(0); + func_op.insertResult(0, new_call_op.getResult(0).getType(), + /*resultAttrs=*/nullptr); + + // Modify the quantized fused function to do dequantize+relu(6). + rewriter.setInsertionPoint(req_op); + Value new_result = rewriter.create( + req_op.getLoc(), func_op.getResultTypes()[0], req_op.getOperand()); + if (func_name.contains("_relu6_")) { + auto min = rewriter.create( + req_op.getLoc(), rewriter.getF32FloatAttr(0)); + auto max = rewriter.create( + req_op.getLoc(), rewriter.getF32FloatAttr(6)); + new_result = rewriter.create( + req_op.getLoc(), min, new_result, max); + } else if (func_name.contains("_relu_")) { + auto min = rewriter.create( + req_op.getLoc(), rewriter.getF32FloatAttr(0)); + new_result = rewriter.create( + req_op.getLoc(), min, new_result, nullptr); + } + return_op->setOperand(0, new_result); + rewriter.eraseOp(req_op); + + return success(); + } +}; + +void MergeFusionWithDequantizePass::runOnOperation() { + ModuleOp module_op = getOperation(); + MLIRContext* ctx = module_op.getContext(); + RewritePatternSet patterns(ctx); + patterns.add(ctx); + if (failed(applyPatternsAndFoldGreedily(module_op, std::move(patterns)))) { + signalPassFailure(); + } +} + +} // namespace +} // namespace mlir::quant::stablehlo diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/passes.td b/tensorflow/compiler/mlir/quantization/stablehlo/passes/passes.td index 63f6f822dbebdf..fdb7fa7941f025 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/passes.td +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/passes.td @@ -60,10 +60,6 @@ def QuantizeCompositeFunctionsPass : Pass<"stablehlo-quantize-composite-function "enable-per-channel-quantized-weight", 
"bool", /*default=*/"true", "Whether to enable per-channel quantized weights.">, - Option<"enable_full_int_quantization_", - "enable-full-int-quantization", - "bool", /*default=*/"false", - "Whether to enable full int quantization, including non compute-heavy ops.">, Option<"mlir_dump_file_name_", "mlir-dump-file-name", "std::optional", /*default=*/"std::nullopt", "MLIR dump file name.">, @@ -71,6 +67,10 @@ def QuantizeCompositeFunctionsPass : Pass<"stablehlo-quantize-composite-function "enable-weight-only", "bool", /*default=*/"false", "Whether to produce weight-only quantized op for convolution and dot_general op.">, + Option<"merge_fusion_with_dequantize_", + "merge-fusion-with-dequantize", + "bool", /*default=*/"false", + "Whether to merge quantized conv/dot_general fusion with subsequent dequantize.">, ]; let dependentDialects = [ "mlir::arith::ArithDialect", @@ -106,10 +106,6 @@ def QuantizePass : Pass<"stablehlo-quantize", "mlir::ModuleOp"> { "enable-per-channel-quantized-weight", "bool", /*default=*/"true", "Whether to enable per-channel quantized weights.">, - Option<"enable_full_int_quantization_", - "enable-full-int-quantization", - "bool", /*default=*/"false", - "Whether to apply full int quantization, including non compute-heavy ops.">, Option<"enable_weight_only_", "enable-weight-only", "bool", /*default=*/"false", @@ -130,6 +126,21 @@ def PostQuantizePass : Pass<"stablehlo-post-quantize", "mlir::func::FuncOp"> { ]; } +def XlaCallModuleToCallPass : Pass<"stablehlo-xla-call-module-to-call", "ModuleOp"> { + let summary = "Convert XlaCallModuleOp to func.call op"; + let dependentDialects = [ + "TF::TensorFlowDialect", + ]; +} + +def MergeFusionWithDequantizePass : Pass<"stablehlo-merge-fusion-with-dequantize", "mlir::ModuleOp"> { + let summary = "Merge quantized conv/dot_general fusion with subsequent dequantize."; + let dependentDialects = [ + "chlo::ChloDialect", + "mlir::stablehlo::StablehloDialect", + ]; +} + def UnwrapXlaCallModuleOpPass : 
Pass<"stablehlo-unwrap-xla-call-module-op", "ModuleOp"> { let summary = "Unwrap XlaCallModuleOps into inline functions if not used for quantizing fused patterns."; let dependentDialects = ["TF::TensorFlowDialect"]; diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/prepare_quantize.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/prepare_quantize.cc index 05d5d71d4d3c17..7d2df9e27f9220 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/prepare_quantize.cc +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/prepare_quantize.cc @@ -140,7 +140,7 @@ void PrepareQuantizePass::runOnOperation() { MLIRContext* ctx = module_op.getContext(); auto func_op_quant_spec = GetStableHloOpQuantSpec; - auto func_op_quant_scale_spec = GetStableHloQuantScaleSpec; + auto func_op_quant_scale_spec = GetStableHloQuantConstraints; for (auto func_op : module_op.getOps()) { // The function might contain more stats ops than required, and it will diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/quantization_patterns.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/quantization_patterns.cc index 10b15f1132fe62..a6d041a5b8cb9e 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/quantization_patterns.cc +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/quantization_patterns.cc @@ -78,8 +78,6 @@ using ::stablehlo::quantization::Method; using ::stablehlo::quantization::QuantizedType; using ::stablehlo::quantization::StaticRangePtq; -constexpr StringRef kCompositeFuncPrefix = "composite_"; -constexpr StringRef kQuantizedFuncPrefix = "quantized_"; constexpr StringRef kEntryFuncAttrName = "_entry_function"; // Returns broadcasted user op of an input op. 
Returns null if @@ -515,9 +513,35 @@ class QuantizeSingularOpPattern : public EntryFuncBodyQuantizationPattern { void rewrite(func::FuncOp entry_func_op, const Method& quantization_method, PatternRewriter& rewriter) const override { auto singular_op = *entry_func_op.getOps().begin(); - Value singular_op_result = singular_op.getResult(); - singular_op_result.setType(entry_func_op.getResultTypes()[0]); + + // For ops that require same operand and result types, use explicit + // requantize op rather than using `entry_func_op`'s result as op result. + auto spec = GetStableHloQuantConstraints(singular_op); + const bool has_same_operand_and_result_type = + spec->has_same_operand_and_result_type_requirement; + if (has_same_operand_and_result_type) { + const Type operand_type = entry_func_op.getArgumentTypes()[0]; + const Type func_result_type = entry_func_op.getResultTypes()[0]; + + // Get the quantized tensor manipulation op's output type and update. + const auto singular_op_result_type = + singular_op_result.getType().cast(); + const ArrayRef singular_op_shape = + singular_op_result_type.getShape(); + const TensorType new_singular_op_result_type = + singular_op_result_type.cloneWith( + singular_op_shape, + getElementTypeOrSelf(operand_type).cast()); + singular_op_result.setType(new_singular_op_result_type); + + // Create requantization op and return. + rewriter.setInsertionPointAfter(singular_op); + CreateAndReturnUniformQuantizeOp(rewriter, *singular_op, entry_func_op, + func_result_type); + } else { + singular_op_result.setType(entry_func_op.getResultTypes()[0]); + } } }; @@ -543,6 +567,29 @@ void QuantizeEntryFuncOp( entry_func_op.setSymName(quantized_function_name); } +// Replaces `xla_call_module_op` with a newly created `func::CallOp`, where the +// callee is `callee_func_op`. The existence of `kQuantizationMethodAttr` in +// `xla_call_module_op` should be guaranteed. 
+void ReplaceXlaCallModuleOpWithNewCallOp(TF::XlaCallModuleOp xla_call_module_op, + func::FuncOp callee_func_op, + PatternRewriter& rewriter) { + OpBuilder::InsertionGuard insertion_guard(rewriter); + + // Create a new `CallOp` that calls `callee_func_op`. + rewriter.setInsertionPoint(xla_call_module_op); + auto call_op = + rewriter.create(xla_call_module_op.getLoc(), callee_func_op, + xla_call_module_op.getArgs()); + + // Transfer the `kQuantizationMethodAttr` attribute to the `CallOp`, + // indicating what `Method` has been applied to the quantized unit. + call_op->setAttr( + kQuantizationMethodAttr, + xla_call_module_op->getAttrOfType(kQuantizationMethodAttr)); + + rewriter.replaceOp(xla_call_module_op, call_op); +} + // Replaces a quantized `xla_call_module_op` with a `func::CallOp`. The callee // is expected to remain unquantized (thus having a signature mismatch), and it // is also quantized accordingly. @@ -558,10 +605,8 @@ void ReplaceQuantizedXlaCallModuleOpWithQuantizedCallOp( QuantizeEntryFuncOp(ctx, rewriter, xla_call_module_op, entry_func_op, body_rewrite_pattern, quantization_method); - // Replace the XlaCallModuleOp with a new CallOp. - rewriter.setInsertionPoint(xla_call_module_op); - rewriter.replaceOpWithNewOp(xla_call_module_op, entry_func_op, - xla_call_module_op.getArgs()); + ReplaceXlaCallModuleOpWithNewCallOp(xla_call_module_op, entry_func_op, + rewriter); } // Pattern that mainly does two things: @@ -593,6 +638,10 @@ class XlaCallModuleOpToCallOp : public OpRewritePattern { ModuleOp module_op = op->getParentOfType(); SymbolTable symbol_table(module_op); + // Ignore ops without quantization method. + // Consider adding checks for individual methods. + if (!op->getAttr(kQuantizationMethodAttr)) return failure(); + // Ignore unquantized ops. 
if (!IsQuantizedXlaCallModuleOp(op)) return failure(); @@ -664,7 +713,7 @@ class QuantizeOpWithRegionPattern // Quantization parameters can be propagated only for same-scale ops and // same-scale ops are quantized only when they are connected to quantized // composite functions. - if (!GetStableHloQuantScaleSpec(op_with_region) + if (!GetStableHloQuantConstraints(op_with_region) ->has_same_scale_requirement || !IsConnectedWithQuantizedCompsiteFunction(op_with_region)) { return failure(); @@ -866,7 +915,8 @@ bool IsConnectedWithQuantizedCompsiteFunction(Operation* same_scale_op) { } // Check whether the preceding op is a quantized same-scale op. - if (GetStableHloQuantScaleSpec(preceding_op)->has_same_scale_requirement) { + if (GetStableHloQuantConstraints(preceding_op) + ->has_same_scale_requirement) { for (const OpResult result : preceding_op->getResults()) { const Type element_type = getElementTypeOrSelf(result.getType()); if (element_type.isa()) { @@ -893,7 +943,7 @@ bool IsConnectedWithQuantizedCompsiteFunction(Operation* same_scale_op) { } // Check whether the following op is a quantized same-scale op. - if (GetStableHloQuantScaleSpec(following_op) + if (GetStableHloQuantConstraints(following_op) ->has_same_scale_requirement) { for (Value operand : following_op->getOperands()) { const Type element_type = getElementTypeOrSelf(operand.getType()); @@ -923,7 +973,9 @@ class QuantizeWeightOnlyOpPattern : public EntryFuncBodyQuantizationPattern { }; // Compute heavy patterns should be quantized for both server and ODML targets. -void PopulateComputeHeavyPatterns( +// Most patterns here are useful when quantized since they are compute heavy +// or memory bound. 
+void PopulateCommonQuantizationPatterns( MLIRContext& ctx, RewritePatternSet& patterns, const bool enable_per_channel_quantized_weight) { patterns.add>( diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/quantization_patterns.h b/tensorflow/compiler/mlir/quantization/stablehlo/passes/quantization_patterns.h index 9aa33ee0316ee1..67eb267c1d9037 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/quantization_patterns.h +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/quantization_patterns.h @@ -148,7 +148,7 @@ class StableHloQuantizationPattern : public OpRewritePattern { return failure(); } - if (GetStableHloQuantScaleSpec(candidate_op) + if (GetStableHloQuantConstraints(candidate_op) ->has_same_scale_requirement && !IsConnectedWithQuantizedCompsiteFunction(candidate_op)) { return failure(); @@ -250,9 +250,10 @@ class StableHloQuantizationPattern : public OpRewritePattern { } }; -// Populates pattern for compute heavy operations. -void PopulateComputeHeavyPatterns(MLIRContext& ctx, RewritePatternSet& patterns, - bool enable_per_channel_quantized_weight); +// Populates common patterns that are usually compute heavy or memory bound. +void PopulateCommonQuantizationPatterns( + MLIRContext& ctx, RewritePatternSet& patterns, + bool enable_per_channel_quantized_weight); // Populates conversion patterns for all quantizable ops, including // ops that are not compute-heavy and data movement ops. 
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/quantize.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/quantize.cc index 8bb2bd33564481..0000057402886f 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/quantize.cc +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/quantize.cc @@ -103,10 +103,8 @@ class QuantizePass : public impl::QuantizePassBase { using impl::QuantizePassBase::QuantizePassBase; explicit QuantizePass(const bool enable_per_channel_quantized_weight, - const bool enable_full_int_quantization, const bool enable_weight_only) { enable_per_channel_quantized_weight_ = enable_per_channel_quantized_weight; - enable_full_int_quantization_ = enable_full_int_quantization; enable_weight_only_ = enable_weight_only; } @@ -125,13 +123,11 @@ void QuantizePass::runOnOperation() { PopulateQuantizeWeightOnlyPatterns(ctx, patterns); } - PopulateComputeHeavyPatterns(ctx, patterns, - enable_per_channel_quantized_weight_); + PopulateCommonQuantizationPatterns(ctx, patterns, + enable_per_channel_quantized_weight_); // Quantize all quantizable ops, including ops that are not compute-heavy. - if (enable_full_int_quantization_) { - PopulateAllQuantizablePatterns(ctx, patterns); - } + PopulateAllQuantizablePatterns(ctx, patterns); if (failed(applyPatternsAndFoldGreedily(module_op, std::move(patterns)))) { // There are cases where no rewrites happen even if a pattern matches, diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/quantize_composite_functions.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/quantize_composite_functions.cc index f3cf92dde359d1..1efc5d40c7ce20 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/quantize_composite_functions.cc +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/quantize_composite_functions.cc @@ -14,6 +14,7 @@ limitations under the License. 
==============================================================================*/ #include +#include "absl/log/log.h" #include "absl/status/status.h" #include "mlir/Dialect/Arith/IR/Arith.h" // from @llvm-project // IWYU pragma: keep #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project @@ -26,6 +27,7 @@ limitations under the License. #include "stablehlo/dialect/StablehloOps.h" // from @stablehlo // IWYU pragma: keep #include "tensorflow/compiler/mlir/lite/quantization/ir/QuantOps.h" // IWYU pragma: keep #include "tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_config.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/report.h" #include "tensorflow/compiler/mlir/quantization/stablehlo/passes/passes.h" #include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/cc/run_passes.h" @@ -41,7 +43,6 @@ namespace mlir::quant::stablehlo { namespace { -using QuantMethod = tensorflow::quantization::QuantizationMethod::PresetMethod; using ::tensorflow::quantization::RunPassesOnModuleOp; class QuantizeCompositeFunctionsPass @@ -55,9 +56,8 @@ class QuantizeCompositeFunctionsPass explicit QuantizeCompositeFunctionsPass( const bool enable_per_channel_quantized_weight, - const bool enable_weight_only, const bool enable_full_int_quantization) { + const bool enable_weight_only) { enable_per_channel_quantized_weight_ = enable_per_channel_quantized_weight; - enable_full_int_quantization_ = enable_full_int_quantization; enable_weight_only_ = enable_weight_only; } @@ -90,21 +90,34 @@ void QuantizeCompositeFunctionsPass::runOnOperation() { QuantizePassOptions quantize_options; quantize_options.enable_per_channel_quantized_weight_ = enable_per_channel_quantized_weight_; - quantize_options.enable_full_int_quantization_ = - enable_full_int_quantization_; quantize_options.enable_weight_only_ = enable_weight_only_; // QuantizePass modifies FuncOps referenced 
outside of its given scope // and therefore requires a module-level context. pm.addPass(createQuantizePass(quantize_options)); pm.addNestedPass(createPostQuantizePass()); + // Convert XlaCallModuleOps lifted but not quantized to func.call op. + // The reasons these ops are not quantized may be: + // 1. Disabled due to selective quantization. + // 2. Not supported, e.g. add op for server. + pm.addPass(createXlaCallModuleToCallPass()); + + // TODO: b/321729008 - move this implementation to quantization_patterns.cc. + if (merge_fusion_with_dequantize_) { + pm.addPass(createMergeFusionWithDequantizePass()); + } + ModuleOp module_op = getOperation(); if (const absl::Status pm_run_status = RunPassesOnModuleOp(mlir_dump_file_name_, pm, module_op); !pm_run_status.ok()) { signalPassFailure(); } + + // Emit human-readable quantization report. + const QuantizationReport report(module_op); + report.Print(); } -} // namespace +} // namespace } // namespace mlir::quant::stablehlo diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/replace_stablehlo_ops_in_main_function_with_xla_call_module_ops.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/replace_stablehlo_ops_in_main_function_with_xla_call_module_ops.cc index 5209f6be325979..6ed82c125b0be9 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/replace_stablehlo_ops_in_main_function_with_xla_call_module_ops.cc +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/replace_stablehlo_ops_in_main_function_with_xla_call_module_ops.cc @@ -20,6 +20,7 @@ limitations under the License. 
#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project #include "mlir/IR/Attributes.h" // from @llvm-project #include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project #include "mlir/IR/BuiltinOps.h" // from @llvm-project #include "mlir/IR/BuiltinTypeInterfaces.h" // from @llvm-project #include "mlir/IR/BuiltinTypes.h" // from @llvm-project @@ -488,7 +489,7 @@ void ReplaceStablehloOpsInMainFunctionWithXlaCallModuleOpsPass:: func::FuncOp main_func = FindMainFuncOp(module_op); if (!main_func) return; - // To handle the case where `main` function has tf.StatefulPartitionedCallOp, + // In case the model has tf.StatefulPartitionedCallOp or tf.PartitionedCallOp, // we recursively find called functions and process StableHLO ops in them. SmallVector func_ops; func_ops.push_back(main_func); @@ -499,6 +500,10 @@ void ReplaceStablehloOpsInMainFunctionWithXlaCallModuleOpsPass:: if (!main_func) continue; SymbolTable symbol_table(module_op); + for (auto call_op : main_func.getOps()) { + func_ops.push_back(dyn_cast_or_null(symbol_table.lookup( + call_op.getFAttr().cast().getValue()))); + } for (auto call_op : main_func.getOps()) { func_ops.push_back( dyn_cast_or_null(symbol_table.lookup(call_op.getF()))); diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/testing/test_post_calibration_component.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/testing/test_post_calibration_component.cc index d596d1885c8066..bdf7f311f26bfa 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/testing/test_post_calibration_component.cc +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/testing/test_post_calibration_component.cc @@ -24,12 +24,12 @@ limitations under the License. 
#include "stablehlo/dialect/StablehloOps.h" // from @stablehlo // IWYU pragma: keep #include "stablehlo/dialect/VhloOps.h" // from @stablehlo // IWYU pragma: keep #include "tensorflow/compiler/mlir/lite/quantization/ir/QuantOps.h" // IWYU pragma: keep +#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/config.h" #include "tensorflow/compiler/mlir/quantization/stablehlo/cc/post_calibration.h" #include "tensorflow/compiler/mlir/quantization/stablehlo/passes/testing/passes.h" // IWYU pragma: keep #include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_dialect.h" // IWYU pragma: keep #include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h" // IWYU pragma: keep -#include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" #include "xla/mlir_hlo/mhlo/IR/hlo_ops.h" // IWYU pragma: keep namespace mlir::quant::stablehlo::testing { @@ -39,8 +39,9 @@ namespace mlir::quant::stablehlo::testing { namespace { +using ::stablehlo::quantization::ExpandPresets; using ::stablehlo::quantization::PipelineConfig; -using ::stablehlo::quantization::StaticRangePtqPreset; +using ::stablehlo::quantization::QuantizationConfig; class TestPostCalibrationComponentPass : public impl::TestPostCalibrationComponentPassBase< @@ -61,12 +62,16 @@ void TestPostCalibrationComponentPass::runOnOperation() { OpPassManager pm(ModuleOp::getOperationName()); - StaticRangePtqPreset static_range_ptq_preset; + QuantizationConfig config = QuantizationConfig::default_instance(); + config.mutable_static_range_ptq_preset(); + + const QuantizationConfig new_config = ExpandPresets(config); + PipelineConfig pipeline_config; pipeline_config.set_unpack_quantized_types(unpack_quantized_types_); PostCalibrationComponent component(&ctx); - component.AddPasses(pm, static_range_ptq_preset, pipeline_config); + component.AddPasses(pm, new_config.specs(), pipeline_config); if (failed(runPipeline(pm, module_op))) { 
signalPassFailure(); diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/xla_call_module_to_call.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/xla_call_module_to_call.cc new file mode 100644 index 00000000000000..123244db3b7dbb --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/xla_call_module_to_call.cc @@ -0,0 +1,83 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include + +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/OpDefinition.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/IR/SymbolTable.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "mlir/Support/TypeID.h" // from @llvm-project +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project +#include "tensorflow/compiler/mlir/lite/transforms/passes.h" +#include "tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" + +namespace mlir::quant::stablehlo { + +#define GEN_PASS_DEF_XLACALLMODULETOCALLPASS +#include 
"tensorflow/compiler/mlir/quantization/stablehlo/passes/passes.h.inc" + +namespace { + +// Converts XlaCallModuleOps to func.call. +class XlaCallModuleToCallPass + : public impl::XlaCallModuleToCallPassBase { + public: + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(XlaCallModuleToCallPass) + + explicit XlaCallModuleToCallPass() = default; + + private: + void runOnOperation() override; +}; + +// Converts XlaCallModuleOps to func.call. +class XlaCallModuleOpToCallOp : public OpRewritePattern { + public: + explicit XlaCallModuleOpToCallOp(MLIRContext* context) + : OpRewritePattern(context) {} + + LogicalResult matchAndRewrite(TF::XlaCallModuleOp op, + PatternRewriter& rewriter) const override { + auto module_op = op->getParentOfType(); + SymbolTable symbol_table(module_op); + + auto entry_func_op = dyn_cast_or_null( + symbol_table.lookup(GetEntryFunctionName(op))); + if (!entry_func_op) return failure(); + + // Replace the XlaCallModuleOp with a new CallOp. + rewriter.replaceOpWithNewOp(op, entry_func_op, op.getArgs()); + return success(); + } +}; + +void XlaCallModuleToCallPass::runOnOperation() { + ModuleOp module_op = getOperation(); + MLIRContext* ctx = module_op.getContext(); + RewritePatternSet patterns(&getContext()); + patterns.add(ctx); + if (failed(applyPatternsAndFoldGreedily(module_op, std::move(patterns)))) { + signalPassFailure(); + } +} + +} // namespace +} // namespace mlir::quant::stablehlo diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/python/BUILD b/tensorflow/compiler/mlir/quantization/stablehlo/python/BUILD index 2b20cc48a89d69..df5252b986adf5 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/python/BUILD +++ b/tensorflow/compiler/mlir/quantization/stablehlo/python/BUILD @@ -60,8 +60,10 @@ pytype_strict_library( # "//tensorflow/python/ops:array_ops", # "//tensorflow/python/ops:math_ops", # "//tensorflow/python/ops:nn_ops", +# "//tensorflow/python/ops:variables", # "//tensorflow/python/platform:client_testlib", # 
"//tensorflow/python/saved_model:load", +# "//tensorflow/python/saved_model:loader", # "//tensorflow/python/saved_model:save", # "//tensorflow/python/types:core", # "@absl_py//absl/testing:parameterized", diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/python/integration_test/quantize_model_test.py b/tensorflow/compiler/mlir/quantization/stablehlo/python/integration_test/quantize_model_test.py index 80a2c560ef865b..f65c56bc577742 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/python/integration_test/quantize_model_test.py +++ b/tensorflow/compiler/mlir/quantization/stablehlo/python/integration_test/quantize_model_test.py @@ -64,6 +64,7 @@ class StaticRangeQuantizationTest(quantize_model_test_base.QuantizedModelTest): ([10, 1, 1024], [10, 1024, 3]), ([2, 3, 1, 1024], [2, 3, 1024, 3]), ), + 'merge_fusion_with_dequantize': (False, True), }]) ) @test_util.run_in_graph_and_eager_modes @@ -72,6 +73,7 @@ def test_matmul_ptq_model( bias_fn: Optional[ops.Operation], activation_fn: Optional[ops.Operation], dim_sizes: Sequence[int], + merge_fusion_with_dequantize: bool, ): lhs_dim_size, rhs_dim_size = dim_sizes input_shape = (*lhs_dim_size,) @@ -115,6 +117,9 @@ def data_gen() -> repr_dataset.RepresentativeDataset: ] ), tf_saved_model=qc.TfSavedModelConfig(tags=[tag_constants.SERVING]), + pipeline_config=qc.PipelineConfig( + merge_fusion_with_dequantize=merge_fusion_with_dequantize + ), ) quantization.quantize_saved_model( self._input_saved_model_path, @@ -150,6 +155,19 @@ def data_gen() -> repr_dataset.RepresentativeDataset: 0.65, ) + if merge_fusion_with_dequantize: + # Check activation functions are explicitly present. + # If present the last op before return should be stablehlo.clamp for relu6 + # and stablehlo.maximum for relu. 
+ if activation_fn is nn_ops.relu6: + self.assertRegex(module_str, r'stablehlo.clamp.*\n.*return') + elif activation_fn is nn_ops.relu: + self.assertRegex(module_str, r'stablehlo.maximum.*\n.*return') + else: + # Check activation functions are implicit. + self.assertNotRegex(module_str, r'stablehlo.clamp.*\n.*return') + self.assertNotRegex(module_str, r'stablehlo.maximum.*\n.*return') + @parameterized.parameters( testing.parameter_combinations([{ 'same_scale_op': ( @@ -342,6 +360,8 @@ def data_gen() -> repr_dataset.RepresentativeDataset: False, True, ), + 'merge_fusion_with_dequantize': (False, True), + 'has_func_alias': (False, True), }]) ) @test_util.run_in_graph_and_eager_modes @@ -352,7 +372,9 @@ def test_conv_ptq_model( has_batch_norm: bool, input_shape_dynamic: bool, enable_per_channel_quantized_weight: bool, + merge_fusion_with_dequantize: bool, dilations: Sequence[int] = None, + has_func_alias: bool = False, ): input_shape = (None, 3, 4, 3) if input_shape_dynamic else (1, 3, 4, 3) filter_shape = (2, 3, 3, 2) @@ -366,15 +388,16 @@ def test_conv_ptq_model( has_batch_norm, strides, dilations, + 'SAME', + has_func_alias, ) - # TODO(b/331809306): investigate why these tests fail. - # skip these test cases. - if ( - bias_fn is None - and has_batch_norm - and input_shape_dynamic - and enable_per_channel_quantized_weight - ): + # TODO: b/331809306 - Investigate why these test fail then re-enable. + if has_batch_norm and (bias_fn or not input_shape_dynamic): + return + + # TODO: b/331120943 - Re-enable this after correctly handling quantization + # granularity per quantizable scope. + if has_batch_norm and (not bias_fn and input_shape_dynamic): return # Generate model input data. 
@@ -410,6 +433,9 @@ def data_gen() -> repr_dataset.RepresentativeDataset: enable_per_channel_quantized_weight=enable_per_channel_quantized_weight, ), tf_saved_model=qc.TfSavedModelConfig(tags=[tag_constants.SERVING]), + pipeline_config=qc.PipelineConfig( + merge_fusion_with_dequantize=merge_fusion_with_dequantize + ), ) quantization.quantize_saved_model( self._input_saved_model_path, @@ -445,6 +471,27 @@ def data_gen() -> repr_dataset.RepresentativeDataset: 0.61, ) + if merge_fusion_with_dequantize: + # Check activation functions are explicitly present. + # If present the last op before return should be stablehlo.clamp for relu6 + # and stablehlo.maximum for relu. + if activation_fn is nn_ops.relu6: + self.assertRegex(module_str, r'stablehlo.clamp.*\n.*return') + elif activation_fn is nn_ops.relu: + self.assertRegex(module_str, r'stablehlo.maximum.*\n.*return') + else: + # Check activation functions are implicit. + self.assertNotRegex(module_str, r'stablehlo.clamp.*\n.*return') + self.assertNotRegex(module_str, r'stablehlo.maximum.*\n.*return') + + if has_func_alias: + func_aliases = self._get_function_aliases( + self._output_saved_model_path, [tag_constants.SERVING] + ) + self.assertCountEqual( + func_aliases.values(), [quantize_model_test_base.FUNC_ALIAS] + ) + @parameterized.parameters( testing.parameter_combinations([{ 'equation': ( @@ -528,6 +575,66 @@ def data_gen() -> repr_dataset.RepresentativeDataset: 0.65, ) + @parameterized.named_parameters( + ('use_constant_with_int32_input', np.int32, False), + ('use_variable_with_int32_input', np.int32, True), + ('use_constant_with_int64_input', np.int64, False), + ('use_variable_with_int64_input', np.int64, True), + ) + @test_util.run_v2_only + def test_gather_model(self, input_type, use_variable): + model = self._create_gather_model(input_type, use_variable) + + save.save(model, self._input_saved_model_path) + + rng = np.random.default_rng(seed=42) + static_input_shape = [6] + + def data_gen() -> 
repr_dataset.RepresentativeDataset: + for _ in range(100): + yield { + 'input_tensor': rng.uniform( + low=0.0, high=10, size=static_input_shape + ).astype(input_type) + } + + dataset_path = self.create_tempfile('tfrecord').full_path + path_map = {'serving_default': dataset_path} + repr_dataset.TfRecordRepresentativeDatasetSaver(path_map).save( + {'serving_default': data_gen()} + ) + + config = qc.QuantizationConfig( + static_range_ptq_preset=qc.StaticRangePtqPreset( + representative_datasets=[ + qc.RepresentativeDatasetConfig( + tf_record=qc.TfRecordFile(path=dataset_path) + ) + ] + ), + tf_saved_model=qc.TfSavedModelConfig(tags=[tag_constants.SERVING]), + ) + quantization.quantize_saved_model( + self._input_saved_model_path, + self._output_saved_model_path, + config, + ) + + root = load.load(self._output_saved_model_path) + self.assertCountEqual(root.signatures.keys(), {'serving_default'}) + module_str = self._extract_first_xla_call_module_op( + self._output_saved_model_path + ) + self.assertTrue(re.search('stablehlo.gather.*xi8>', module_str)) + + # Due to other meta data, the compression is not exactly 1/4. + self.assertLess( + testing.get_size_ratio( + self._output_saved_model_path, self._input_saved_model_path + ), + 1 / 3, + ) + def test_when_preset_not_srq_raises_error(self): self._create_matmul_model( input_shape=(1, 1024), @@ -985,7 +1092,7 @@ def test_matmul_weight_only_model( ) config = qc.QuantizationConfig( - weight_only_preset=qc.WeightOnlyPreset(), + weight_only_ptq_preset=qc.WeightOnlyPtqPreset(), tf_saved_model=qc.TfSavedModelConfig(tags=[tag_constants.SERVING]), ) quantization.quantize_saved_model( @@ -1010,9 +1117,8 @@ def test_matmul_weight_only_model( self._output_saved_model_path ) - # Tests that the output graph contains subtract and multiply for + # Tests that the output graph contains multiply for symmetric # dequantization. 
- self.assertTrue(re.search('stablehlo.subtract', module_str)) self.assertTrue(re.search('stablehlo.multiply', module_str)) # Tests that the output graph contains float dot_general. self.assertTrue( @@ -1043,6 +1149,7 @@ def test_matmul_weight_only_model( False, True, ), + 'has_func_alias': (False, True), }]) ) @test_util.run_in_graph_and_eager_modes @@ -1053,6 +1160,7 @@ def test_conv_weight_only_model( has_batch_norm: bool, input_shape_dynamic: bool, dilations: Sequence[int] = None, + has_func_alias: bool = False, ): input_shape = (None, 3, 4, 3) if input_shape_dynamic else (1, 3, 4, 3) filter_shape = (2, 3, 3, 2) @@ -1066,6 +1174,8 @@ def test_conv_weight_only_model( has_batch_norm, strides, dilations, + 'SAME', + has_func_alias, ) rng = np.random.default_rng(1234) @@ -1077,7 +1187,7 @@ def test_conv_weight_only_model( ) config = qc.QuantizationConfig( - weight_only_preset=qc.WeightOnlyPreset(), + weight_only_ptq_preset=qc.WeightOnlyPtqPreset(), tf_saved_model=qc.TfSavedModelConfig(tags=[tag_constants.SERVING]), ) quantization.quantize_saved_model( @@ -1111,14 +1221,142 @@ def test_conv_weight_only_model( re.search('stablehlo.convolution.*xf32>.*xf32>.*xf32>', module_str) ) + if has_func_alias: + func_aliases = self._get_function_aliases( + self._output_saved_model_path, [tag_constants.SERVING] + ) + self.assertCountEqual( + func_aliases.values(), [quantize_model_test_base.FUNC_ALIAS] + ) + # Due to other meta data, the compression is not exactly 1/4. self.assertLess( testing.get_size_ratio( self._output_saved_model_path, self._input_saved_model_path ), - 0.35, + 0.4, + ) + + @parameterized.parameters( + testing.parameter_combinations([{ + 'shape_dynamic': ( + False, + True, + ), + }]) + ) + @test_util.run_in_graph_and_eager_modes + def test_add_ptq_model( + self, + shape_dynamic: bool, + ): + input_shape = (None, 3, 4, 3) if shape_dynamic else (2, 3, 4, 3) + self._create_add_model( + input_shape, + self._input_saved_model_path, ) + # Generate model input data. 
+ rng = np.random.default_rng(seed=42) + static_input_shape = [dim if dim is not None else 2 for dim in input_shape] + + def data_gen() -> repr_dataset.RepresentativeDataset: + for _ in range(100): + yield { + 'input_tensor': rng.uniform( + low=0.0, high=1.0, size=static_input_shape + ).astype(np.float32) + } + + dataset_path = self.create_tempfile('tfrecord').full_path + path_map = {'serving_default': dataset_path} + repr_dataset.TfRecordRepresentativeDatasetSaver(path_map).save( + {'serving_default': data_gen()} + ) + + config = qc.QuantizationConfig( + static_range_ptq_preset=qc.StaticRangePtqPreset( + representative_datasets=[ + qc.RepresentativeDatasetConfig( + tf_record=qc.TfRecordFile(path=dataset_path) + ) + ], + ), + tf_saved_model=qc.TfSavedModelConfig(tags=[tag_constants.SERVING]), + ) + quantization.quantize_saved_model( + self._input_saved_model_path, + self._output_saved_model_path, + config, + ) + + self.assertEqual( + self._get_num_xla_call_module_op(self._output_saved_model_path), 1 + ) + module_str = self._extract_first_xla_call_module_op( + self._output_saved_model_path + ) + + # Check add is not quantized. + self.assertTrue(re.search(r'stablehlo.add.*f32>', module_str)) + + @parameterized.parameters( + testing.parameter_combinations([{ + 'shape_dynamic': ( + False, + True, + ), + }]) + ) + @test_util.run_in_graph_and_eager_modes + def test_add_weight_only_model( + self, + shape_dynamic: bool, + ): + input_shape = (None, 3, 4, 3) if shape_dynamic else (2, 3, 4, 3) + self._create_add_model( + input_shape, + self._input_saved_model_path, + ) + + # Generate model input data. 
+ rng = np.random.default_rng(seed=42) + static_input_shape = [dim if dim is not None else 2 for dim in input_shape] + + def data_gen() -> repr_dataset.RepresentativeDataset: + for _ in range(100): + yield { + 'input_tensor': rng.uniform( + low=0.0, high=1.0, size=static_input_shape + ).astype(np.float32) + } + + dataset_path = self.create_tempfile('tfrecord').full_path + path_map = {'serving_default': dataset_path} + repr_dataset.TfRecordRepresentativeDatasetSaver(path_map).save( + {'serving_default': data_gen()} + ) + + config = qc.QuantizationConfig( + weight_only_ptq_preset=qc.WeightOnlyPtqPreset(), + tf_saved_model=qc.TfSavedModelConfig(tags=[tag_constants.SERVING]), + ) + quantization.quantize_saved_model( + self._input_saved_model_path, + self._output_saved_model_path, + config, + ) + + self.assertEqual( + self._get_num_xla_call_module_op(self._output_saved_model_path), 1 + ) + module_str = self._extract_first_xla_call_module_op( + self._output_saved_model_path + ) + + # Check add is not quantized. 
+ self.assertTrue(re.search(r'stablehlo.add.*f32>', module_str), module_str) + if __name__ == '__main__': test.main() diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/python/integration_test/quantize_model_test_base.py b/tensorflow/compiler/mlir/quantization/stablehlo/python/integration_test/quantize_model_test_base.py index d71c89e15d313f..31c53a4cf20fe9 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/python/integration_test/quantize_model_test_base.py +++ b/tensorflow/compiler/mlir/quantization/stablehlo/python/integration_test/quantize_model_test_base.py @@ -31,11 +31,15 @@ from tensorflow.python.ops import array_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import nn_ops +from tensorflow.python.ops import variables from tensorflow.python.platform import test from tensorflow.python.saved_model import load +from tensorflow.python.saved_model import loader_impl from tensorflow.python.saved_model import save as saved_model_save from tensorflow.python.types import core +FUNC_ALIAS = 'some_alias' + class QuantizedModelTest(test.TestCase, parameterized.TestCase): """Base test class for StableHLO quant tests.""" @@ -72,6 +76,29 @@ def _extract_first_xla_call_module_op( return str(stablehlo_module) raise ValueError('No XlaCallModule found in saved model.') + def _get_num_xla_call_module_op(self, output_saved_model_path: str) -> int: + """Gets the number of XlaCallModule ops in the output saved model.""" + root = load.load(output_saved_model_path) + tf_graph_def = root.signatures['serving_default'].graph.as_graph_def() + count = 0 + for node_def in tf_graph_def.node: + if node_def.op == 'XlaCallModule': + count += 1 + for function in tf_graph_def.library.function: + for node_def in function.node_def: + if node_def.op == 'XlaCallModule': + count += 1 + return count + + def _get_function_aliases( + self, output_saved_model_path: str, tags: List[str] + ) -> dict[str, str]: + """Gets the function aliases in the output 
saved model.""" + loader = loader_impl.SavedModelLoader(output_saved_model_path) + return loader.get_meta_graph_def_from_tags( + tags + ).meta_info_def.function_aliases + def _create_matmul_model( self, input_shape: Sequence[int], @@ -238,6 +265,7 @@ def _create_conv2d_model( strides: Sequence[int] = (1, 1, 1, 1), dilations: Sequence[int] = (1, 1, 1, 1), padding: str = 'SAME', + has_func_alias: bool = False, ) -> module.Module: class ConvModel(module.Module): """A simple model with a single conv2d, bias and relu.""" @@ -294,6 +322,11 @@ def conv2d(self, input_tensor: core.Tensor) -> Mapping[str, core.Tensor]: return {'output': out} model = ConvModel() + save_options = None + if has_func_alias: + save_options = tensorflow.saved_model.SaveOptions( + function_aliases={FUNC_ALIAS: model.conv2d} + ) saved_model_save.save( model, saved_model_path, @@ -302,6 +335,76 @@ def conv2d(self, input_tensor: core.Tensor) -> Mapping[str, core.Tensor]: shape=input_shape, dtype=dtypes.float32, name='input_tensor' ) ), + options=save_options, + ) + return model + + def _create_gather_model(self, input_type, use_variable) -> module.Module: + class GatherModel(module.Module): + """A simple model with a single gather.""" + + def __init__(self, use_variable): + """Initializes a GatherModel. + + Args: + use_variable: If True, creates a variable for weight. 
+ """ + super().__init__() + w_val = np.random.randn(128, 32).astype('f4') + if use_variable: + self.w = variables.Variable(w_val) + else: + self.w = w_val + + @def_function.function( + input_signature=[ + tensor_spec.TensorSpec( + shape=[6], dtype=input_type, name='input_tensor' + ) + ] + ) + def __call__( + self, input_tensor: core.Tensor + ) -> Mapping[str, core.Tensor]: + """Performs a gather operation.""" + out = array_ops.gather_v2(self.w, input_tensor) + return {'output': out} + + return GatherModel(use_variable) + + def _create_add_model( + self, + shape: Sequence[int], + saved_model_path: str, + ) -> module.Module: + class AddModel(module.Module): + """A simple model with a single add.""" + + def __init__(self): + pass + + @def_function.function + def add(self, input_tensor: core.Tensor) -> Mapping[str, core.Tensor]: + """Performs an add operation. + + Args: + input_tensor: Input tensor to perform add on. + + Returns: + A map of: output key -> output result. + """ + out = math_ops.add(input_tensor, input_tensor) + return {'output': out} + + model = AddModel() + saved_model_save.save( + model, + saved_model_path, + signatures=model.add.get_concrete_function( + tensor_spec.TensorSpec( + shape=shape, dtype=dtypes.float32, name='input_tensor' + ) + ), ) return model diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/python/integration_test/stablehlo_quantizer_odml_oss.ipynb b/tensorflow/compiler/mlir/quantization/stablehlo/python/integration_test/stablehlo_quantizer_odml_oss.ipynb new file mode 100644 index 00000000000000..858401154bec3b --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/python/integration_test/stablehlo_quantizer_odml_oss.ipynb @@ -0,0 +1,192 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "tWhm0JFMPJ5I" + }, + "source": [ + "Copyright 2024 Google LLC.\n", + "\n", + "Licensed under the Apache License, Version 2.0 (the \"License\");" + ] + }, + { + "cell_type": "markdown", + "metadata": { + 
"id": "RJcqTAlfPQjk" + }, + "source": [ + "# [OSS] JAX to TFLite with StableHLO Quantization Demonstration for ODML." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "cqeGmbO6PPNd" + }, + "source": [ + "This example shows a JAX Keras reference model converted into a StableHLO module and via `jax2tf`, then quantized in the ODML Converter via the StableHLO Quantizer.\n", + "\n", + "Note: This API is experimental and will likely have breakages with other models. Please reach out to [scalable-opt-team@google.com](mailto:scalable-opt-team@google.com) and we will support your use case." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "-S0P42BpPSeJ" + }, + "source": [ + "## StableHLO Quantizer\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "FacwMD9MPUew" + }, + "source": [ + "StableHLO Quantizer is a quantization API to enable ML framework optionality and hardware retargetability." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "RXZUHZQoQZOo" + }, + "outputs": [], + "source": [ + "!pip uninstall tensorflow --yes" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "aYz36YEKPYRk" + }, + "outputs": [], + "source": [ + "!pip3 install tf-nightly\n", + "!pip3 install keras-core" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "duab6P-nPZzF" + }, + "outputs": [], + "source": [ + "import tensorflow as tf\n", + "print(\"TensorFlow version:\", tf.__version__)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "c9JX9RJTPaoW" + }, + "outputs": [], + "source": [ + "import os\n", + "os.environ['KERAS_BACKEND'] = 'jax'\n", + "import jax.numpy as jnp\n", + "import numpy as np\n", + "import tensorflow as tf\n", + "from keras_core.applications import ResNet50\n", + "from jax.experimental import jax2tf" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + 
"id": "rTcHwDPBPchd" + }, + "outputs": [], + "source": [ + "input_shape = (1, 224, 224, 3)\n", + "\n", + "jax_callable = jax2tf.convert(\n", + " ResNet50(\n", + " input_shape=input_shape[1:],\n", + " pooling='avg',\n", + " ).call,\n", + " with_gradient=False,\n", + " native_serialization=True,\n", + " native_serialization_platforms=('cpu',))\n", + "\n", + "tf_module = tf.Module()\n", + "tf_module.f = tf.function(\n", + " jax_callable,\n", + " autograph=False,\n", + " input_signature=[\n", + " tf.TensorSpec(input_shape, jnp.float32, 'lhs_operand')\n", + " ],\n", + ")\n", + "\n", + "saved_model_dir = '/tmp/saved_model'\n", + "tf.saved_model.save(tf_module, saved_model_dir)\n", + "\n", + "def calibration_dataset():\n", + " rng = np.random.default_rng(seed=1235)\n", + " for _ in range(2):\n", + " yield {\n", + " 'lhs_operand': rng.uniform(low=-1.0, high=1.0, size=input_shape).astype(\n", + " np.float32\n", + " )\n", + " }\n", + "converter = tf.lite.TFLiteConverter.from_saved_model(saved_model_dir)\n", + "converter.target_spec.supported_ops = [\n", + " tf.lite.OpsSet.SELECT_TF_OPS, # enable TensorFlow ops.\n", + " tf.lite.OpsSet.TFLITE_BUILTINS, # enable TFL ops.\n", + "]\n", + "converter.representative_dataset = calibration_dataset\n", + "converter.optimizations = [tf.lite.Optimize.DEFAULT]\n", + "# Below flag controls whether to use StableHLO Quantizer or TFLite quantizer.\n", + "converter.experimental_use_stablehlo_quantizer = True\n", + "\n", + "quantized_model = converter.convert()\n", + "\n", + "with open('/tmp/resnet50_quantized.tflite', 'wb') as f:\n", + " f.write(quantized_model)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "u3b9Xj8dPdXo" + }, + "outputs": [], + "source": [ + "print(str(os.path.getsize('/tmp/resnet50_quantized.tflite') \u003e\u003e 20) + 'MB')" + ] + } + ], + "metadata": { + "colab": { + "private_outputs": true, + "provenance": [ + { + "file_id": 
"https://github.com/tensorflow/tensorflow/tree/master/tensorflow/compiler/mlir/quantization/stablehlo/python/integration_test/stablehlo_quantizer_odml_oss.ipynb", + "timestamp": 1712841250910 + } + ] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/python/quantization.py b/tensorflow/compiler/mlir/quantization/stablehlo/python/quantization.py index aa3745a3fdd453..f9a1a90e071453 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/python/quantization.py +++ b/tensorflow/compiler/mlir/quantization/stablehlo/python/quantization.py @@ -43,6 +43,16 @@ def _serialize_signature_def_map( return signature_def_map_serialized +def _has_quantization_method( + quantization_specs: qc.QuantizationSpecs, method: str +) -> bool: + """Returns whether a given QuantizationSpecs has the given quantization method.""" + for spec in quantization_specs.specs: + if spec.method.HasField(method): + return True + return False + + # TODO: b/310594193 - Export API to pip package. def quantize_saved_model( src_saved_model_path: str, @@ -60,15 +70,6 @@ def quantize_saved_model( ValueError: When `config` was not configured for static-range PTQ single representative dataset. """ - if not ( - config.HasField('static_range_ptq_preset') - and len(config.static_range_ptq_preset.representative_datasets) == 1 - ) and not config.HasField('weight_only_preset'): - raise ValueError( - '`quantize_saved_model` currently only supports static-range PTQ with a' - ' single signature or weight-only quantization.' - ) - # Updates user-provided `QuantizationConfig`s for the internal quantization # pipeline to work with. 
print('=== User-provided QuantizationConfig ===') @@ -82,6 +83,15 @@ def quantize_saved_model( print('=== Updated QuantizationConfig ===') print(config) + if not ( + _has_quantization_method(config.specs, 'static_range_ptq') + and len(config.calibration_options.representative_datasets) == 1 + ) and not _has_quantization_method(config.specs, 'weight_only_ptq'): + raise ValueError( + '`quantize_saved_model` currently only supports static-range PTQ with a' + ' single signature or weight-only quantization.' + ) + signature_def_map = save_model.get_signatures_from_saved_model( src_saved_model_path, signature_keys=None, @@ -89,7 +99,9 @@ def quantize_saved_model( ) signature_def_map_serialized = _serialize_signature_def_map(signature_def_map) - if config.HasField('static_range_ptq_preset'): + # Currently, only StaticRangePtq or WeightOnlyPtq is supported. + # Consider merging the pipelines to address mixed algorithm models. + if _has_quantization_method(config.specs, 'static_range_ptq'): pywrap_quantization.static_range_ptq( src_saved_model_path, dst_saved_model_path, @@ -98,7 +110,7 @@ def quantize_saved_model( signature_def_map_serialized=signature_def_map_serialized, py_function_library=py_function_lib.PyFunctionLibrary(), ) - elif config.HasField('weight_only_preset'): + elif _has_quantization_method(config.specs, 'weight_only_ptq'): pywrap_quantization.weight_only_ptq( src_saved_model_path, dst_saved_model_path, diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.proto b/tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.proto index efdceebd6c2008..81f2ff3686fbbe 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.proto +++ b/tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.proto @@ -77,8 +77,9 @@ message StaticRangePtqPreset { bool enable_full_int_quantization = 3; } -// Applies int8 per-tensor weight-only quantization for all dot_general op. 
-message WeightOnlyPreset {} +// Applies int8 per-tensor weight-only post-training quantization for all +// dot_general op. +message WeightOnlyPtqPreset {} // Metadata specific to the input TensorFlow SavedModel, which may be required // to identify the specific MetaGraphDef to quantize, for example. @@ -96,6 +97,12 @@ message PipelineConfig { // hardware performs better with integer ops. // Default value: true optional bool unpack_quantized_types = 1; + + // When set to True, requantize op in the quantized fusion will merge with the + // subsequent dequantize op if present. + // Default value: false + // TODO: b/321729008 - re-consider default value after testing on prod model. + bool merge_fusion_with_dequantize = 2; } // Represents a single quantizable unit, a (nearly) minimum unit of work when @@ -158,6 +165,12 @@ message StaticRangePtq { map input_quantized_types = 1; } +message WeightOnlyPtq { + // Operand index -> QuantizedType mapping. Operands that are not specified + // here will be quantized with best effort. + map input_quantized_types = 1; +} + // Represents a matching method that matches quantizable units by lifted // functions' names. message FunctionNameMatcherSpec { @@ -178,6 +191,7 @@ message Method { oneof method { NoQuantization no_quantization = 1; StaticRangePtq static_range_ptq = 2; + WeightOnlyPtq weight_only_ptq = 3; } } @@ -322,7 +336,7 @@ message QuantizationConfig { oneof preset { // Performs best-effort static-range post-training quantization (PTQ). StaticRangePtqPreset static_range_ptq_preset = 1; - WeightOnlyPreset weight_only_preset = 7; + WeightOnlyPtqPreset weight_only_ptq_preset = 7; } // TF SavedModel specific information for the input model. 
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/bridge/convert_tf_quant_ops_to_mhlo.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/bridge/convert_tf_quant_ops_to_mhlo.mlir index 4e883aa0e11c70..32e605ba7bf0dc 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/tests/bridge/convert_tf_quant_ops_to_mhlo.mlir +++ b/tensorflow/compiler/mlir/quantization/stablehlo/tests/bridge/convert_tf_quant_ops_to_mhlo.mlir @@ -25,7 +25,7 @@ func.func @uniform_quantized_add(%input: tensor<3x2xf32>) -> tensor<3x2xf32> { %input_zps = "tf.Const"() { value = dense<4> : tensor } : () -> tensor // tensor_proto that points to dense<127> of type !tf_type.qint32. - // CHECK-DAG: %[[RHS:.*]] = mhlo.constant() {value = dense<127> : tensor<2xi32>} : () -> tensor<2x!quant.uniform> + // CHECK-DAG: %[[RHS:.*]] = mhlo.constant() <{value = dense<127> : tensor<2xi32>}> : () -> tensor<2x!quant.uniform> %bias = "tf.Const"() { value = #tf_type : tensor<2x!tf_type.qint32> } : () -> tensor<2x!tf_type.qint32> %bias_scales = "tf.Const"() { value = dense<2.0> : tensor } : () -> tensor %bias_zps = "tf.Const"() { value = dense<4> : tensor } : () -> tensor diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/components/post_calibration_component.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/components/post_calibration_component.mlir index 317da0b762e60d..2f149281fbd0be 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/tests/components/post_calibration_component.mlir +++ b/tensorflow/compiler/mlir/quantization/stablehlo/tests/components/post_calibration_component.mlir @@ -8,10 +8,10 @@ // int ops. 
func.func @main(%arg0: tensor<1x1024xf32>) -> tensor<1x3xf32> { %0 = "tf.Const"() <{value = dense<0.5> : tensor<1024x3xf32>}> : () -> tensor<1024x3xf32> - %1 = "tf.CustomAggregator"(%arg0) <{id = "1"}> {calibration_method = 1 : i64, device = "", initial_num_bins = 0 : i64, max = 0.999992311 : f32, max_percentile = 0.000000e+00 : f32, min = 7.547870e-07 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<1x1024xf32>) -> tensor<1x1024xf32> - %2 = "tf.XlaCallModule"(%1, %0) <{Sout = [#tf_type.shape<1x3>], dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64}> {_entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = ""} : (tensor<1x1024xf32>, tensor<1024x3xf32>) -> tensor<1x3xf32> - %3 = "tf.CustomAggregator"(%2) <{id = "2"}> {calibration_method = 1 : i64, device = "", initial_num_bins = 0 : i64, max = 18.3033524 : f32, max_percentile = 0.000000e+00 : f32, min = -17.5216827 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<1x3xf32>) -> tensor<1x3xf32> - return %3 : tensor<1x3xf32> + %1:4 = "tf.CustomAggregator"(%arg0) <{id = "1"}> {calibration_method = 1 : i64, device = "", initial_num_bins = 0 : i64, max = 0.999992311 : f32, max_percentile = 0.000000e+00 : f32, min = 7.547870e-07 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<1x1024xf32>) -> (tensor<1x1024xf32>, tensor, tensor, tensor<*xi64>) + %2 = "tf.XlaCallModule"(%1#0, %0) <{Sout = [#tf_type.shape<1x3>], dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64}> {_entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _quantization_method = "static_range_ptq {}", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = ""} : (tensor<1x1024xf32>, tensor<1024x3xf32>) -> 
tensor<1x3xf32> + %3:4 = "tf.CustomAggregator"(%2) <{id = "2"}> {calibration_method = 1 : i64, device = "", initial_num_bins = 0 : i64, max = 5.3033524 : f32, max_percentile = 0.000000e+00 : f32, min = -3.5216827 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<1x3xf32>) -> (tensor<1x3xf32>, tensor, tensor, tensor<*xi64>) + return %3#0 : tensor<1x3xf32> } func.func private @composite_dot_general_fn_1(%arg0: tensor<1x1024xf32>, %arg1: tensor<1024x3xf32>) -> tensor<1x3xf32> attributes {_from_xla_call_module} { %0 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [1] x [0] : (tensor<1x1024xf32>, tensor<1024x3xf32>) -> tensor<1x3xf32> @@ -36,10 +36,10 @@ func.func private @composite_dot_general_fn_1(%arg0: tensor<1x1024xf32>, %arg1: func.func @main_no_unpack(%arg0: tensor<1x1024xf32>) -> tensor<1x3xf32> { %0 = "tf.Const"() <{value = dense<0.5> : tensor<1024x3xf32>}> : () -> tensor<1024x3xf32> - %1 = "tf.CustomAggregator"(%arg0) <{id = "1"}> {calibration_method = 1 : i64, device = "", initial_num_bins = 0 : i64, max = 0.999992311 : f32, max_percentile = 0.000000e+00 : f32, min = 7.547870e-07 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<1x1024xf32>) -> tensor<1x1024xf32> - %2 = "tf.XlaCallModule"(%1, %0) <{Sout = [#tf_type.shape<1x3>], dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64}> {_entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = ""} : (tensor<1x1024xf32>, tensor<1024x3xf32>) -> tensor<1x3xf32> - %3 = "tf.CustomAggregator"(%2) <{id = "2"}> {calibration_method = 1 : i64, device = "", initial_num_bins = 0 : i64, max = 18.3033524 : f32, max_percentile = 0.000000e+00 : f32, min = -17.5216827 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<1x3xf32>) -> tensor<1x3xf32> - return %3 : tensor<1x3xf32> + %1:4 = "tf.CustomAggregator"(%arg0) <{id = 
"1"}> {calibration_method = 1 : i64, device = "", initial_num_bins = 0 : i64, max = 0.999992311 : f32, max_percentile = 0.000000e+00 : f32, min = 7.547870e-07 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<1x1024xf32>) -> (tensor<1x1024xf32>, tensor, tensor, tensor<*xi64>) + %2 = "tf.XlaCallModule"(%1#0, %0) <{Sout = [#tf_type.shape<1x3>], dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64}> {_entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _quantization_method = "static_range_ptq {}", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = ""} : (tensor<1x1024xf32>, tensor<1024x3xf32>) -> tensor<1x3xf32> + %3:4 = "tf.CustomAggregator"(%2) <{id = "2"}> {calibration_method = 1 : i64, device = "", initial_num_bins = 0 : i64, max = 5.3033524 : f32, max_percentile = 0.000000e+00 : f32, min = -3.5216827 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<1x3xf32>) -> (tensor<1x3xf32>, tensor, tensor, tensor<*xi64>) + return %3#0 : tensor<1x3xf32> } func.func private @composite_dot_general_fn_1(%arg0: tensor<1x1024xf32>, %arg1: tensor<1024x3xf32>) -> tensor<1x3xf32> attributes {_from_xla_call_module} { %0 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [1] x [0] : (tensor<1x1024xf32>, tensor<1024x3xf32>) -> tensor<1x3xf32> @@ -47,10 +47,10 @@ func.func private @composite_dot_general_fn_1(%arg0: tensor<1x1024xf32>, %arg1: } // CHECK-NO-UNPACK-LABEL: func.func @main_no_unpack // CHECK-NO-UNPACK-SAME: (%[[ARG_0:.+]]: tensor<1x1024xf32>) -> tensor<1x3xf32> -// CHECK-NO-UNPACK-DAG: %[[CONST:.+]] = stablehlo.constant() {value = dense<{{.*}}> : tensor<1024x3xi8>} : () -> tensor<1024x3x!quant.uniform:f32, {{.*}}>> +// CHECK-NO-UNPACK-DAG: %[[CONST:.+]] = stablehlo.constant() {value = dense<{{.*}}> : tensor<1024x3xi8>} : () -> tensor<1024x3x!quant.uniform:f32:1, {{.*}}>> // CHECK-NO-UNPACK: %[[QUANTIZE_0:.+]] = 
stablehlo.uniform_quantize %[[ARG_0]] : (tensor<1x1024xf32>) -> tensor<1x1024x!quant.uniform> // CHECK-NO-UNPACK: %[[DOT:.+]] = stablehlo.dot_general %[[QUANTIZE_0]], %[[CONST]] -// CHECK-NO-UNPACK: %[[QUANTIZE_1:.+]] = stablehlo.uniform_quantize %2 : (tensor<1x3x!quant.uniform>) -> tensor<1x3x!quant.uniform> +// CHECK-NO-UNPACK: %[[QUANTIZE_1:.+]] = stablehlo.uniform_quantize %[[DOT]] : (tensor<1x3x!quant.uniform>) -> tensor<1x3x!quant.uniform> // CHECK-NO-UNPACK: %[[DEQUANTIZE:.+]] = stablehlo.uniform_dequantize %[[QUANTIZE_1]] : (tensor<1x3x!quant.uniform>) -> tensor<1x3xf32> // CHECK-NO-UNPACK: return %[[DEQUANTIZE]] : tensor<1x3xf32> @@ -60,20 +60,15 @@ func.func private @composite_dot_general_fn_1(%arg0: tensor<1x1024xf32>, %arg1: func.func @main(%arg0: tensor<1x1024xf32>) -> tensor<1x3xf32> { %0 = "tf.Const"() <{value = dense<0.5> : tensor<1024x3xf32>}> : () -> tensor<1024x3xf32> - %2 = "tf.XlaCallModule"(%arg0, %0) <{Sout = [#tf_type.shape<1x3>], dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64}> {_entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = ""} : (tensor<1x1024xf32>, tensor<1024x3xf32>) -> tensor<1x3xf32> + %2 = "tf.XlaCallModule"(%arg0, %0) <{Sout = [#tf_type.shape<1x3>], dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64}> {_entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _quantization_method = "static_range_ptq {}", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = ""} : (tensor<1x1024xf32>, tensor<1024x3xf32>) -> tensor<1x3xf32> return %2 : tensor<1x3xf32> } func.func private @composite_dot_general_fn_1(%arg0: tensor<1x1024xf32>, %arg1: tensor<1024x3xf32>) -> tensor<1x3xf32> attributes 
{_from_xla_call_module} { %0 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [1] x [0] : (tensor<1x1024xf32>, tensor<1024x3xf32>) -> tensor<1x3xf32> return %0 : tensor<1x3xf32> } -// CHECK-LABEL: func.func @main +// CHECK: func.func @main // CHECK-SAME: (%[[ARG_0:.+]]: tensor<1x1024xf32>) -> tensor<1x3xf32> - // CHECK-DAG: %[[CONST_0:.+]] = stablehlo.constant dense<{{.*}}> : tensor<1024x3xf32> -// CHECK: "tf.XlaCallModule"(%[[ARG_0]], %[[CONST_0]]) - -// CHECK: func.func private @composite_dot_general_fn_1 -// CHECK-SAME: attributes {_from_xla_call_module} -// CHECK: %[[DOT_GENERAL_0:.+]] = stablehlo.dot_general -// CHECK-SAME: contracting_dims = [1] x [0] : (tensor<1x1024xf32>, tensor<1024x3xf32>) -> tensor<1x3xf32> +// CHECK: stablehlo.dot_general %[[ARG_0]], %[[CONST_0]] +// CHECK-NOT: tf.XlaCallModule diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/components/pre_calibration_component.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/components/pre_calibration_component.mlir index 1fe56cde49601d..954323af9ef7ad 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/tests/components/pre_calibration_component.mlir +++ b/tensorflow/compiler/mlir/quantization/stablehlo/tests/components/pre_calibration_component.mlir @@ -8,10 +8,10 @@ func.func @main(%arg0: tensor<1x4xf32>) -> tensor<1x3xf32> { } // CHECK: @main(%[[ARG_0:.+]]: tensor<1x4xf32>) -> tensor<1x3xf32> // CHECK-DAG: %[[CST:.+]] = stablehlo.constant dense<1.000000e+00> : tensor<4x3xf32> -// CHECK: %[[CUSTOM_AGGREGATOR_0:.+]] = "tf.CustomAggregator"(%[[ARG_0]]) <{id = "0"}> {{.*}} : (tensor<1x4xf32>) -> tensor<1x4xf32> +// CHECK: %[[CUSTOM_AGGREGATOR_0:.+]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%[[ARG_0]]) <{id = "0"}> {{.*}} : (tensor<1x4xf32>) -> (tensor<1x4xf32>, tensor, tensor, tensor<0xi64>) // CHECK: %[[XLA_CALL_MODULE:.+]] = "tf.XlaCallModule"(%[[CUSTOM_AGGREGATOR_0]], %[[CST]]) // CHECK-SAME: _entry_function = @composite_dot_general_fn_1, 
_original_entry_function = "composite_dot_general_fn_1" -// CHECK: %[[CUSTOM_AGGREGATOR_1:.+]] = "tf.CustomAggregator"(%[[XLA_CALL_MODULE]]) <{id = "1"}> {{.*}} : (tensor<1x3xf32>) -> tensor<1x3xf32> +// CHECK: %[[CUSTOM_AGGREGATOR_1:.+]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%[[XLA_CALL_MODULE]]) <{id = "1"}> {{.*}} : (tensor<1x3xf32>) -> (tensor<1x3xf32>, tensor, tensor, tensor<0xi64>) // CHECK: return %[[CUSTOM_AGGREGATOR_1]] : tensor<1x3xf32> // CHECK: } // CHECK: } @@ -28,10 +28,10 @@ func.func @serving_default(%arg0: tensor<1x4xf32>) -> tensor<1x3xf32> { } // CHECK: @serving_default(%[[ARG_0:.+]]: tensor<1x4xf32>) -> tensor<1x3xf32> // CHECK-DAG: %[[CST:.+]] = stablehlo.constant dense<1.000000e+00> : tensor<4x3xf32> -// CHECK: %[[CUSTOM_AGGREGATOR_0:.+]] = "tf.CustomAggregator"(%[[ARG_0]]) <{id = "0"}> {{.*}} : (tensor<1x4xf32>) -> tensor<1x4xf32> +// CHECK: %[[CUSTOM_AGGREGATOR_0:.+]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%[[ARG_0]]) <{id = "0"}> {{.*}} : (tensor<1x4xf32>) -> (tensor<1x4xf32>, tensor, tensor, tensor<0xi64>) // CHECK: %[[XLA_CALL_MODULE:.+]] = "tf.XlaCallModule"(%[[CUSTOM_AGGREGATOR_0]], %[[CST]]) // CHECK-SAME: _entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1" -// CHECK: %[[CUSTOM_AGGREGATOR_1:.+]] = "tf.CustomAggregator"(%[[XLA_CALL_MODULE]]) <{id = "1"}> {{.*}} : (tensor<1x3xf32>) -> tensor<1x3xf32> +// CHECK: %[[CUSTOM_AGGREGATOR_1:.+]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%[[XLA_CALL_MODULE]]) <{id = "1"}> {{.*}} : (tensor<1x3xf32>) -> (tensor<1x3xf32>, tensor, tensor, tensor<0xi64>) // CHECK: return %[[CUSTOM_AGGREGATOR_1]] : tensor<1x3xf32> // CHECK: } // CHECK: } @@ -51,12 +51,12 @@ func.func @main(%arg0: tensor<1x8x4x4xf32>) -> tensor<1x8x4x4xf32> { // [b, 0, 1, f]). The weight constant is folded into [0, 1, i, o] format. 
// CHECK-DAG: %[[CST:.+]] = stablehlo.constant dense<3.000000e+00> : tensor<3x3x8x8xf32> // CHECK: %[[TRANSPOSE_1:.+]] = stablehlo.transpose %arg0, dims = [0, 2, 3, 1] : (tensor<1x8x4x4xf32>) -> tensor<1x4x4x8xf32> -// CHECK: %[[CUSTOM_AGGREGATOR_0:.+]] = "tf.CustomAggregator"(%[[TRANSPOSE_1]]) {{.*}} : (tensor<1x4x4x8xf32>) -> tensor<1x4x4x8xf32> +// CHECK: %[[CUSTOM_AGGREGATOR_0:.+]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%[[TRANSPOSE_1]]) {{.*}} : (tensor<1x4x4x8xf32>) -> (tensor<1x4x4x8xf32>, tensor, tensor, tensor<0xi64>) // Corresponds to the converted `stablehlo.convolution`. Note that the shapes // correspond to the dimension numbers of: [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f] // CHECK: %[[XLA_CALL_MODULE:.+]] = "tf.XlaCallModule"(%[[CUSTOM_AGGREGATOR_0]], %[[CST]]) {{.*}} : (tensor<1x4x4x8xf32>, tensor<3x3x8x8xf32>) -> tensor<1x4x4x8xf32> -// CHECK: %[[CUSTOM_AGGREGATOR_1:.+]] = "tf.CustomAggregator"(%[[XLA_CALL_MODULE]]) {{.*}} : (tensor<1x4x4x8xf32>) -> tensor<1x4x4x8xf32> +// CHECK: %[[CUSTOM_AGGREGATOR_1:.+]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%[[XLA_CALL_MODULE]]) {{.*}} : (tensor<1x4x4x8xf32>) -> (tensor<1x4x4x8xf32>, tensor, tensor, tensor<0xi64>) // CHECK: %[[TRANSPOSE_2:.+]] = stablehlo.transpose %[[CUSTOM_AGGREGATOR_1]], dims = [0, 3, 1, 2] : (tensor<1x4x4x8xf32>) -> tensor<1x8x4x4xf32> // CHECK: return %[[TRANSPOSE_2]] : tensor<1x8x4x4xf32> diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/components/tf_to_stablehlo.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/components/tf_to_stablehlo.mlir index 240b10d8438431..ec757bc96effaa 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/tests/components/tf_to_stablehlo.mlir +++ b/tensorflow/compiler/mlir/quantization/stablehlo/tests/components/tf_to_stablehlo.mlir @@ -114,7 +114,7 @@ func.func @func_conv_batchnorm_relu6_dynamic(%arg_0: tensor) -> (te // This test makes sure functions with tf._noinline=true is not inlined. 
module { - func.func @partitioned_call(%arg0: tensor<1x2x2x3xf32>) -> (tensor<1x2x2x3xf32>) { + func.func @stateful_partitioned_call(%arg0: tensor<1x2x2x3xf32>) -> (tensor<1x2x2x3xf32>) { %0 = "tf.StatefulPartitionedCall"(%arg0) <{ config = "", config_proto = "", executor_type = "", f = @some_func }> { @@ -139,7 +139,7 @@ module { module { func.func @partitioned_call(%arg0: tensor<1x2x2x3xf32>) -> (tensor<1x2x2x3xf32>) { - %0 = "tf.StatefulPartitionedCall"(%arg0) <{ + %0 = "tf.PartitionedCall"(%arg0) <{ config = "", config_proto = "", executor_type = "", f = @some_func }> { _collective_manager_ids = [], device = "" @@ -153,6 +153,6 @@ module { } // CHECK: module -// CHECK-NOT: tf.StatefulPartitionedCall +// CHECK-NOT: tf.PartitionedCall // CHECK-NOT: some_func // CHECK-NOT: func.call diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/lift_quantizable_spots_as_functions_with_quantization_specs.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/lift_quantizable_spots_as_functions_with_quantization_specs.mlir index 69bf09104c814d..d6afb6461c0da9 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/lift_quantizable_spots_as_functions_with_quantization_specs.mlir +++ b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/lift_quantizable_spots_as_functions_with_quantization_specs.mlir @@ -123,3 +123,35 @@ func.func @main(%arg0: tensor<1x2xf32>) -> tensor<1x2xf32> { // STATIC-RANGE-PTQ-TO-COMPUTE-HEAVY: _original_entry_function // STATIC-RANGE-PTQ-TO-COMPUTE-HEAVY-NOT: _quantization_method // STATIC-RANGE-PTQ-TO-COMPUTE-HEAVY: _tfl_quant_trait = "fully_quantizable" + +// ----- + +// RUN: stablehlo-quant-opt %s -stablehlo-test-lift-quantizable-spots-as-functions-with-quantization-specs="quantization-specs=static-range-ptq-to-all" \ +// RUN: -split-input-file | FileCheck %s --check-prefix=STATIC-RANGE-PTQ-TO-ALL + +// STATIC-RANGE-PTQ-TO-ALL-LABEL: @some_func +func.func @some_func(%arg0: 
tensor<1x1x167xf32>) -> tensor<1x1x64xf32> { + %0 = stablehlo.constant dense<2.000000e+00> : tensor<167x64xf32> + %1 = stablehlo.dot_general %arg0, %0, contracting_dims = [2] x [0], precision = [DEFAULT, DEFAULT] : (tensor<1x1x167xf32>, tensor<167x64xf32>) -> tensor<1x1x64xf32> + return %1 : tensor<1x1x64xf32> +} +// Tests that XlaCallModuleOp in non-main function has attributes set correctly. + +// STATIC-RANGE-PTQ-TO-ALL: %[[CONST:.+]] = stablehlo.constant dense<2.000000e+00> +// STATIC-RANGE-PTQ-TO-ALL: %[[XLA_CALL_MODULE:.+]] = "tf.XlaCallModule"(%arg0, %[[CONST]]) + +// Check that the `_quantization_method` attribute contains the quantization +// method in textproto format, enabling static-range PTQ. +// STATIC-RANGE-PTQ-TO-ALL-SAME: _entry_function = @composite_dot_general_fn_1 +// STATIC-RANGE-PTQ-TO-ALL-SAME: _original_entry_function +// STATIC-RANGE-PTQ-TO-ALL-SAME: _quantization_method = "static_range_ptq { }" +// STATIC-RANGE-PTQ-TO-ALL-SAME: _tfl_quant_trait = "fully_quantizable" + +// STATIC-RANGE-PTQ-TO-ALL: return %[[XLA_CALL_MODULE:.+]] : tensor<1x1x64xf32> +// STATIC-RANGE-PTQ-TO-ALL: } + +// STATIC-RANGE-PTQ-TO-ALL-LABEL: private @composite_dot_general_fn_1 +// STATIC-RANGE-PTQ-TO-ALL-SAME: tf_quant.composite_function +// STATIC-RANGE-PTQ-TO-ALL: %[[DOT_GENERAL:.+]] = stablehlo.dot_general %arg0, %arg1 +// STATIC-RANGE-PTQ-TO-ALL: return %[[DOT_GENERAL:.+]] : tensor<1x1x64xf32> +// STATIC-RANGE-PTQ-TO-ALL: } diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/merge-fusion-with-dequantize.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/merge-fusion-with-dequantize.mlir new file mode 100644 index 00000000000000..c228b25a2903c9 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/merge-fusion-with-dequantize.mlir @@ -0,0 +1,198 @@ +// RUN: stablehlo-quant-opt %s -stablehlo-merge-fusion-with-dequantize -split-input-file -verify-diagnostics | FileCheck %s + +// Merge fusion with 
dequantize for relu case. + +module attributes {tf_saved_model.semantics} { + // CHECK-LABEL: func.func private @merge_relu_fusion + func.func private @merge_relu_fusion(%arg0: tensor<1x4xf32>) -> tensor<1x3xf32> { + %0 = stablehlo.constant() {value = dense<127> : tensor<4x3xi8>} : () -> tensor<4x3x!quant.uniform:f32:1, {5.000000e-03,5.000000e-03,5.000000e-03}>> + %1 = stablehlo.uniform_quantize %arg0 : (tensor<1x4xf32>) -> tensor<1x4x!quant.uniform> + // CHECK: call @quantized_dot_general_relu_fn + // CHECK-SAME: -> tensor<1x3xf32> + %2 = call @quantized_dot_general_relu_fn(%1, %0) : (tensor<1x4x!quant.uniform>, tensor<4x3x!quant.uniform:f32:1, {5.000000e-03,5.000000e-03,5.000000e-03}>>) -> tensor<1x3x!quant.uniform> + %3 = stablehlo.uniform_dequantize %2 : (tensor<1x3x!quant.uniform>) -> tensor<1x3xf32> + return %3 : tensor<1x3xf32> + } + + // CHECK-LABEL: func.func private @quantized_dot_general_relu_fn + func.func private @quantized_dot_general_relu_fn( + %arg0: tensor<1x4x!quant.uniform>, + %arg1: tensor<4x3x!quant.uniform:f32:1, {5.000000e-03,5.000000e-03,5.000000e-03}>> + ) -> tensor<1x3x!quant.uniform> attributes {_from_xla_call_module} { + // CHECK: %[[MIN:.*]] = stablehlo.constant dense<0.000000e+00> : tensor + // CHECK: %[[DOT:.*]] = stablehlo.dot_general %arg0, %arg1 + // CHECK: %[[DQ:.*]] = stablehlo.uniform_dequantize %[[DOT]] + // CHECK: %[[MAX:.*]] = chlo.broadcast_maximum %[[DQ]], %[[MIN]] + %0 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [1] x [0] : (tensor<1x4x!quant.uniform>, tensor<4x3x!quant.uniform:f32:1, {5.000000e-03,5.000000e-03,5.000000e-03}>>) -> tensor<1x3x!quant.uniform> + %1 = stablehlo.uniform_quantize %0 : (tensor<1x3x!quant.uniform>) -> tensor<1x3x!quant.uniform> + return %1 : tensor<1x3x!quant.uniform> + } +} + +// ----- + +// Merge fusion with dequantize for relu6 case. 
+ +module attributes {tf_saved_model.semantics} { + // CHECK-LABEL: func.func private @merge_relu6_fusion + func.func private @merge_relu6_fusion(%arg0: tensor<1x4xf32>) -> tensor<1x3xf32> { + %0 = stablehlo.constant() {value = dense<127> : tensor<4x3xi8>} : () -> tensor<4x3x!quant.uniform:f32:1, {5.000000e-03,5.000000e-03,5.000000e-03}>> + %1 = stablehlo.uniform_quantize %arg0 : (tensor<1x4xf32>) -> tensor<1x4x!quant.uniform> + // CHECK: call @quantized_dot_general_relu6_fn + // CHECK-SAME: -> tensor<1x3xf32> + %2 = call @quantized_dot_general_relu6_fn(%1, %0) : (tensor<1x4x!quant.uniform>, tensor<4x3x!quant.uniform:f32:1, {5.000000e-03,5.000000e-03,5.000000e-03}>>) -> tensor<1x3x!quant.uniform> + %3 = stablehlo.uniform_dequantize %2 : (tensor<1x3x!quant.uniform>) -> tensor<1x3xf32> + return %3 : tensor<1x3xf32> + } + + // CHECK-LABEL: func.func private @quantized_dot_general_relu6_fn + func.func private @quantized_dot_general_relu6_fn( + %arg0: tensor<1x4x!quant.uniform>, + %arg1: tensor<4x3x!quant.uniform:f32:1, {5.000000e-03,5.000000e-03,5.000000e-03}>> + ) -> tensor<1x3x!quant.uniform> attributes {_from_xla_call_module} { + // CHECK-DAG: %[[MIN:.*]] = stablehlo.constant dense<0.000000e+00> : tensor + // CHECK-DAG: %[[MAX:.*]] = stablehlo.constant dense<6.000000e+00> : tensor + // CHECK: %[[DOT:.*]] = stablehlo.dot_general %arg0, %arg1 + // CHECK: %[[DQ:.*]] = stablehlo.uniform_dequantize %[[DOT]] + // CHECK: %[[CLAMP:.*]] = stablehlo.clamp %[[MIN]], %[[DQ]], %[[MAX]] + %0 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [1] x [0] : (tensor<1x4x!quant.uniform>, tensor<4x3x!quant.uniform:f32:1, {5.000000e-03,5.000000e-03,5.000000e-03}>>) -> tensor<1x3x!quant.uniform> + %1 = stablehlo.uniform_quantize %0 : (tensor<1x3x!quant.uniform>) -> tensor<1x3x!quant.uniform> + return %1 : tensor<1x3x!quant.uniform> + } +} + +// ----- + +// Merge fusion with dequantize for no activation case. 
+ +module attributes {tf_saved_model.semantics} { + // CHECK-LABEL: func.func private @merge_no_act_fusion + func.func private @merge_no_act_fusion(%arg0: tensor<1x4xf32>) -> tensor<1x3xf32> { + %0 = stablehlo.constant() {value = dense<127> : tensor<4x3xi8>} : () -> tensor<4x3x!quant.uniform:f32:1, {5.000000e-03,5.000000e-03,5.000000e-03}>> + %1 = stablehlo.uniform_quantize %arg0 : (tensor<1x4xf32>) -> tensor<1x4x!quant.uniform> + // CHECK: call @quantized_dot_general_fn + // CHECK-SAME: -> tensor<1x3xf32> + %2 = call @quantized_dot_general_fn(%1, %0) : (tensor<1x4x!quant.uniform>, tensor<4x3x!quant.uniform:f32:1, {5.000000e-03,5.000000e-03,5.000000e-03}>>) -> tensor<1x3x!quant.uniform> + %3 = stablehlo.uniform_dequantize %2 : (tensor<1x3x!quant.uniform>) -> tensor<1x3xf32> + return %3 : tensor<1x3xf32> + } + + // CHECK-LABEL: func.func private @quantized_dot_general_fn + func.func private @quantized_dot_general_fn( + %arg0: tensor<1x4x!quant.uniform>, + %arg1: tensor<4x3x!quant.uniform:f32:1, {5.000000e-03,5.000000e-03,5.000000e-03}>> + ) -> tensor<1x3x!quant.uniform> attributes {_from_xla_call_module} { + // CHECK: %[[DOT:.*]] = stablehlo.dot_general %arg0, %arg1 + // CHECK: %[[DQ:.*]] = stablehlo.uniform_dequantize %[[DOT]] + // CHECK: return %[[DQ]] : tensor<1x3xf32> + %0 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [1] x [0] : (tensor<1x4x!quant.uniform>, tensor<4x3x!quant.uniform:f32:1, {5.000000e-03,5.000000e-03,5.000000e-03}>>) -> tensor<1x3x!quant.uniform> + %1 = stablehlo.uniform_quantize %0 : (tensor<1x3x!quant.uniform>) -> tensor<1x3x!quant.uniform> + return %1 : tensor<1x3x!quant.uniform> + } +} + +// ----- + +// Do not merge when quant.uniform result is used directly. 
+ +module attributes {tf_saved_model.semantics} { + // CHECK-LABEL: func.func private @no_merge_fusion_direct_usage + func.func private @no_merge_fusion_direct_usage(%arg0: tensor<1x4xf32>) -> (tensor<1x3xf32>, tensor<1x3x!quant.uniform>) { + %0 = stablehlo.constant() {value = dense<127> : tensor<4x3xi8>} : () -> tensor<4x3x!quant.uniform:f32:1, {5.000000e-03,5.000000e-03,5.000000e-03}>> + %1 = stablehlo.uniform_quantize %arg0 : (tensor<1x4xf32>) -> tensor<1x4x!quant.uniform> + // CHECK: call @quantized_dot_general_relu_fn + // CHECK-SAME: -> tensor<1x3x!quant.uniform> + %2 = call @quantized_dot_general_relu_fn(%1, %0) : (tensor<1x4x!quant.uniform>, tensor<4x3x!quant.uniform:f32:1, {5.000000e-03,5.000000e-03,5.000000e-03}>>) -> tensor<1x3x!quant.uniform> + %3 = stablehlo.uniform_dequantize %2 : (tensor<1x3x!quant.uniform>) -> tensor<1x3xf32> + return %3, %2 : tensor<1x3xf32>, tensor<1x3x!quant.uniform> + } + + // CHECK-LABEL: func.func private @quantized_dot_general_relu_fn + func.func private @quantized_dot_general_relu_fn( + %arg0: tensor<1x4x!quant.uniform>, + %arg1: tensor<4x3x!quant.uniform:f32:1, {5.000000e-03,5.000000e-03,5.000000e-03}>> + ) -> tensor<1x3x!quant.uniform> attributes {_from_xla_call_module} { + %0 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [1] x [0] : (tensor<1x4x!quant.uniform>, tensor<4x3x!quant.uniform:f32:1, {5.000000e-03,5.000000e-03,5.000000e-03}>>) -> tensor<1x3x!quant.uniform> + %1 = stablehlo.uniform_quantize %0 : (tensor<1x3x!quant.uniform>) -> tensor<1x3x!quant.uniform> + return %1 : tensor<1x3x!quant.uniform> + } +} + +// ----- + +// Do not merge when fusion and dequantize is already merged. 
+ +module attributes {tf_saved_model.semantics} { + // CHECK-LABEL: func.func private @no_merge_fusion_already_merged + func.func private @no_merge_fusion_already_merged(%arg0: tensor<1x4xf32>) -> (tensor<1x3xf32>) { + %0 = stablehlo.constant() {value = dense<127> : tensor<4x3xi8>} : () -> tensor<4x3x!quant.uniform:f32:1, {5.000000e-03,5.000000e-03,5.000000e-03}>> + %1 = stablehlo.uniform_quantize %arg0 : (tensor<1x4xf32>) -> tensor<1x4x!quant.uniform> + // CHECK: call @quantized_dot_general_fn + // CHECK-SAME: -> tensor<1x3xf32> + %2 = call @quantized_dot_general_fn(%1, %0) : (tensor<1x4x!quant.uniform>, tensor<4x3x!quant.uniform:f32:1, {5.000000e-03,5.000000e-03,5.000000e-03}>>) -> tensor<1x3xf32> + return %2 : tensor<1x3xf32> + } + + // CHECK-LABEL: func.func private @quantized_dot_general_fn + func.func private @quantized_dot_general_fn( + %arg0: tensor<1x4x!quant.uniform>, + %arg1: tensor<4x3x!quant.uniform:f32:1, {5.000000e-03,5.000000e-03,5.000000e-03}>> + ) -> tensor<1x3xf32> attributes {_from_xla_call_module} { + %0 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [1] x [0] : (tensor<1x4x!quant.uniform>, tensor<4x3x!quant.uniform:f32:1, {5.000000e-03,5.000000e-03,5.000000e-03}>>) -> tensor<1x3x!quant.uniform> + %1 = stablehlo.uniform_dequantize %0 : (tensor<1x3x!quant.uniform>) -> tensor<1x3xf32> + return %1 : tensor<1x3xf32> + } +} + +// ----- + +// Do not merge when function is not quantized function. 
+ +module attributes {tf_saved_model.semantics} { + // CHECK-LABEL: func.func private @merge_relu_fusion + func.func private @merge_relu_fusion(%arg0: tensor<1x4xf32>) -> tensor<1x3xf32> { + %0 = stablehlo.constant() {value = dense<127> : tensor<4x3xi8>} : () -> tensor<4x3x!quant.uniform:f32:1, {5.000000e-03,5.000000e-03,5.000000e-03}>> + %1 = stablehlo.uniform_quantize %arg0 : (tensor<1x4xf32>) -> tensor<1x4x!quant.uniform> + // CHECK: call @some_func + // CHECK-SAME: -> tensor<1x3x!quant.uniform> + %2 = call @some_func(%1, %0) : (tensor<1x4x!quant.uniform>, tensor<4x3x!quant.uniform:f32:1, {5.000000e-03,5.000000e-03,5.000000e-03}>>) -> tensor<1x3x!quant.uniform> + %3 = stablehlo.uniform_dequantize %2 : (tensor<1x3x!quant.uniform>) -> tensor<1x3xf32> + return %3 : tensor<1x3xf32> + } + + // CHECK-LABEL: func.func private @some_func + func.func private @some_func( + %arg0: tensor<1x4x!quant.uniform>, + %arg1: tensor<4x3x!quant.uniform:f32:1, {5.000000e-03,5.000000e-03,5.000000e-03}>> + ) -> tensor<1x3x!quant.uniform> attributes {_from_xla_call_module} { + %0 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [1] x [0] : (tensor<1x4x!quant.uniform>, tensor<4x3x!quant.uniform:f32:1, {5.000000e-03,5.000000e-03,5.000000e-03}>>) -> tensor<1x3x!quant.uniform> + %1 = stablehlo.uniform_quantize %0 : (tensor<1x3x!quant.uniform>) -> tensor<1x3x!quant.uniform> + return %1 : tensor<1x3x!quant.uniform> + } +} + +// ----- + +// Do not merge when the quantized fusion is invalid. 
+ +module attributes {tf_saved_model.semantics} { + // CHECK-LABEL: func.func private @merge_relu_fusion + func.func private @merge_relu_fusion(%arg0: tensor<1x4xf32>) -> tensor<1x3xf32> { + %0 = stablehlo.constant() {value = dense<127> : tensor<4x3xi8>} : () -> tensor<4x3x!quant.uniform:f32:1, {5.000000e-03,5.000000e-03,5.000000e-03}>> + %1 = stablehlo.uniform_quantize %arg0 : (tensor<1x4xf32>) -> tensor<1x4x!quant.uniform> + // CHECK: call @quantized_dot_general_relu_fn + // CHECK-SAME: -> tensor<1x3x!quant.uniform> + %2 = call @quantized_dot_general_relu_fn(%1, %0) : (tensor<1x4x!quant.uniform>, tensor<4x3x!quant.uniform:f32:1, {5.000000e-03,5.000000e-03,5.000000e-03}>>) -> tensor<1x3x!quant.uniform> + %3 = stablehlo.uniform_dequantize %2 : (tensor<1x3x!quant.uniform>) -> tensor<1x3xf32> + return %3 : tensor<1x3xf32> + } + + // CHECK-LABEL: func.func private @quantized_dot_general_relu_fn + func.func private @quantized_dot_general_relu_fn( + %arg0: tensor<1x4x!quant.uniform>, + %arg1: tensor<4x3x!quant.uniform:f32:1, {5.000000e-03,5.000000e-03,5.000000e-03}>> + ) -> tensor<1x3x!quant.uniform> attributes {_from_xla_call_module} { + %0 = stablehlo.constant() {value = dense<2> : tensor<1x3xi8>} : () -> tensor<1x3x!quant.uniform> + return %0 : tensor<1x3x!quant.uniform> + } +} diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/quantize/quantize.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/quantize/quantize.mlir index 06edf90896e5ca..5e75126225244a 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/quantize/quantize.mlir +++ b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/quantize/quantize.mlir @@ -11,7 +11,7 @@ module attributes {tf_saved_model.semantics} { %2 = "quantfork.dcast"(%1) : (tensor<4x3x!quant.uniform:f32:1, {5.000000e-03, 5.000000e-03, 5.000000e-03}>>) -> tensor<4x3xf32> %3 = "quantfork.qcast"(%arg0) {volatile} : (tensor<1x4xf32>) -> tensor<1x4x!quant.uniform> %4 = 
"quantfork.dcast"(%3) : (tensor<1x4x!quant.uniform>) -> tensor<1x4xf32> - %5 = "tf.XlaCallModule"(%4, %2) {Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_fn, _original_entry_function = "composite_dot_general_fn", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<1x4xf32>, tensor<4x3xf32>) -> tensor<1x3xf32> + %5 = "tf.XlaCallModule"(%4, %2) {Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_fn, _original_entry_function = "composite_dot_general_fn", _quantization_method = "static_range_ptq { }", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<1x4xf32>, tensor<4x3xf32>) -> tensor<1x3xf32> %6 = "quantfork.qcast"(%5) {volatile} : (tensor<1x3xf32>) -> tensor<1x3x!quant.uniform> %7 = "quantfork.dcast"(%6) : (tensor<1x3x!quant.uniform>) -> tensor<1x3xf32> return %7 : tensor<1x3xf32> @@ -22,7 +22,10 @@ module attributes {tf_saved_model.semantics} { // CHECK: %[[CONST_0:.+]] = stablehlo.constant dense<1.000000e+00> : tensor<4x3xf32> // CHECK-DAG: %[[QCAST_0:.+]] = "quantfork.qcast"(%[[CONST_0]]) {volatile} : (tensor<4x3xf32>) -> tensor<4x3x!quant.uniform:f32:1, {5.000000e-03,5.000000e-03,5.000000e-03}>> // CHECK-DAG: %[[QCAST_1:.+]] = "quantfork.qcast"(%[[ARG_0]]) {volatile} : (tensor<1x4xf32>) -> tensor<1x4x!quant.uniform> -// CHECK: %[[CALL_0:.+]] = call @quantized_dot_general_fn(%[[QCAST_1]], %[[QCAST_0]]) : (tensor<1x4x!quant.uniform>, tensor<4x3x!quant.uniform:f32:1, {5.000000e-03,5.000000e-03,5.000000e-03}>>) -> tensor<1x3x!quant.uniform> +// CHECK: %[[CALL_0:.+]] = call @quantized_dot_general_fn(%[[QCAST_1]], %[[QCAST_0]]) +// Test that the `Method` has been copied over. 
+// CHECK-SAME: {_quantization_method = "static_range_ptq { }"} +// CHECK-SAME: : (tensor<1x4x!quant.uniform>, tensor<4x3x!quant.uniform:f32:1, {5.000000e-03,5.000000e-03,5.000000e-03}>>) -> tensor<1x3x!quant.uniform> // CHECK: %[[DCAST_0:.+]] = "quantfork.dcast"(%[[CALL_0]]) : (tensor<1x3x!quant.uniform>) -> tensor<1x3xf32> // CHECK: return @@ -40,7 +43,7 @@ module attributes {tf_saved_model.semantics} { // CHECK-LABEL: quantize_simple_xla_call_module_no_operand func.func private @quantize_simple_xla_call_module_no_operand() -> tensor<1x3xf32> { - %0 = "tf.XlaCallModule"() {Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : () -> tensor<1x3xf32> + %0 = "tf.XlaCallModule"() {Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _quantization_method = "static_range_ptq { }", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : () -> tensor<1x3xf32> %1 = "quantfork.qcast"(%0) {volatile} : (tensor<1x3xf32>) -> tensor<1x3x!quant.uniform> %2 = "quantfork.dcast"(%1) : (tensor<1x3x!quant.uniform>) -> tensor<1x3xf32> return %2 : tensor<1x3xf32> @@ -63,7 +66,7 @@ module attributes {tf_saved_model.semantics} { %4 = "quantfork.dcast"(%3) : (tensor<1x2x!quant.uniform>) -> tensor<1x2xf32> // expected-error @+2 {{Failed to find a valid entry function}} // expected-error @+1 {{'tf.XlaCallModule' op operand #0 must be variadic of tensor of tf.dtype values}} - %5 = "tf.XlaCallModule"(%4, %2) {Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_fn, 
_original_entry_function = "composite_dot_general_fn", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32> + %5 = "tf.XlaCallModule"(%4, %2) {Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_fn, _original_entry_function = "composite_dot_general_fn", _quantization_method = "static_range_ptq { }", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32> %6 = "quantfork.qcast"(%5) {volatile} : (tensor<1x3xf32>) -> tensor<1x3x!quant.uniform> %7 = "quantfork.dcast"(%6) : (tensor<1x3x!quant.uniform>) -> tensor<1x3xf32> return %7 : tensor<1x3xf32> diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/quantize/quantize_op_with_region.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/quantize/quantize_op_with_region.mlir index 04104c308a3b3d..d94e1ca3787a3c 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/quantize/quantize_op_with_region.mlir +++ b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/quantize/quantize_op_with_region.mlir @@ -32,7 +32,7 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, p %5 = "quantfork.dcast"(%4) : (tensor<2x3x1024x3x!quant.uniform:f32, 4.000000e-01>>) -> tensor<2x3x1024x3xf32> %6 = "quantfork.qcast"(%arg0) {volatile} : (tensor<2x3x1x1024xf32>) -> tensor<2x3x1x1024x!quant.uniform> %7 = "quantfork.dcast"(%6) : (tensor<2x3x1x1024x!quant.uniform>) -> tensor<2x3x1x1024xf32> - %8 = "tf.XlaCallModule"(%7, %5) <{Sout = [#tf_type.shape<2x3x1x3>], dim_args_spec = [], disabled_checks = [], has_token_input_output = false, 
module = "", platforms = ["CPU"], version = 9 : i64}> {_entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _stablehlo_module_attrs = {jax.uses_shape_polymorphism = true}, _tfl_quant_trait = "fully_quantizable", device = ""} : (tensor<2x3x1x1024xf32>, tensor<2x3x1024x3xf32>) -> tensor<2x3x1x3xf32> + %8 = "tf.XlaCallModule"(%7, %5) <{Sout = [#tf_type.shape<2x3x1x3>], dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = ["CPU"], version = 9 : i64}> {_entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _quantization_method = "static_range_ptq {}", _stablehlo_module_attrs = {jax.uses_shape_polymorphism = true}, _tfl_quant_trait = "fully_quantizable", device = ""} : (tensor<2x3x1x1024xf32>, tensor<2x3x1024x3xf32>) -> tensor<2x3x1x3xf32> %9 = "quantfork.qcast"(%8) {volatile} : (tensor<2x3x1x3xf32>) -> tensor<2x3x1x3x!quant.uniform> %10 = "quantfork.dcast"(%9) : (tensor<2x3x1x3x!quant.uniform>) -> tensor<2x3x1x3xf32> %11 = "stablehlo.reduce_window"(%10, %3) ({ @@ -98,7 +98,7 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, p %8 = "quantfork.dcast"(%7) : (tensor<2x3x1x1024x!quant.uniform>) -> tensor<2x3x1x1024xf32> %9 = "quantfork.qcast"(%1) {volatile} : (tensor<2x3x1024x3xf32>) -> tensor<2x3x1024x3x!quant.uniform:f32, 4.000000e-01>> %10 = "quantfork.dcast"(%9) : (tensor<2x3x1024x3x!quant.uniform:f32, 4.000000e-01>>) -> tensor<2x3x1024x3xf32> - %11 = "tf.XlaCallModule"(%8, %10) <{Sout = [#tf_type.shape<2x3x1x3>], dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = ["CPU"], version = 9 : i64}> {_entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _stablehlo_module_attrs = {jax.uses_shape_polymorphism = true}, _tfl_quant_trait = "fully_quantizable", device = ""} : (tensor<2x3x1x1024xf32>, 
tensor<2x3x1024x3xf32>) -> tensor<2x3x1x3xf32> + %11 = "tf.XlaCallModule"(%8, %10) <{Sout = [#tf_type.shape<2x3x1x3>], dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = ["CPU"], version = 9 : i64}> {_entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _quantization_method = "static_range_ptq {}", _stablehlo_module_attrs = {jax.uses_shape_polymorphism = true}, _tfl_quant_trait = "fully_quantizable", device = ""} : (tensor<2x3x1x1024xf32>, tensor<2x3x1024x3xf32>) -> tensor<2x3x1x3xf32> %12 = "quantfork.qcast"(%11) {volatile} : (tensor<2x3x1x3xf32>) -> tensor<2x3x1x3x!quant.uniform> %13 = "quantfork.dcast"(%12) : (tensor<2x3x1x3x!quant.uniform>) -> tensor<2x3x1x3xf32> return %13 : tensor<2x3x1x3xf32> @@ -150,7 +150,7 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, p %5 = "quantfork.dcast"(%4) : (tensor<2x3x1024x3x!quant.uniform:f32, 4.000000e-01>>) -> tensor<2x3x1024x3xf32> %6 = "quantfork.qcast"(%arg0) {volatile} : (tensor<2x3x1x1024xf32>) -> tensor<2x3x1x1024x!quant.uniform> %7 = "quantfork.dcast"(%6) : (tensor<2x3x1x1024x!quant.uniform>) -> tensor<2x3x1x1024xf32> - %8 = "tf.XlaCallModule"(%7, %5) <{Sout = [#tf_type.shape<2x3x1x3>], dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = ["CPU"], version = 9 : i64}> {_entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _stablehlo_module_attrs = {jax.uses_shape_polymorphism = true}, _tfl_quant_trait = "fully_quantizable", device = ""} : (tensor<2x3x1x1024xf32>, tensor<2x3x1024x3xf32>) -> tensor<2x3x1x3xf32> + %8 = "tf.XlaCallModule"(%7, %5) <{Sout = [#tf_type.shape<2x3x1x3>], dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = ["CPU"], version = 9 : i64}> {_entry_function = @composite_dot_general_fn_1, _original_entry_function = 
"composite_dot_general_fn_1", _quantization_method = "static_range_ptq {}", _stablehlo_module_attrs = {jax.uses_shape_polymorphism = true}, _tfl_quant_trait = "fully_quantizable", device = ""} : (tensor<2x3x1x1024xf32>, tensor<2x3x1024x3xf32>) -> tensor<2x3x1x3xf32> %9 = "quantfork.qcast"(%8) {volatile} : (tensor<2x3x1x3xf32>) -> tensor<2x3x1x3x!quant.uniform> %10 = "quantfork.dcast"(%9) : (tensor<2x3x1x3x!quant.uniform>) -> tensor<2x3x1x3xf32> %11 = stablehlo.reshape %10 : (tensor<2x3x1x3xf32>) -> tensor<2x3x3xf32> @@ -223,7 +223,7 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, p %11 = "quantfork.dcast"(%10) : (tensor<2x3x1x1024x!quant.uniform>) -> tensor<2x3x1x1024xf32> %12 = "quantfork.qcast"(%1) {volatile} : (tensor<2x3x1024x3xf32>) -> tensor<2x3x1024x3x!quant.uniform:f32, 4.000000e-01>> %13 = "quantfork.dcast"(%12) : (tensor<2x3x1024x3x!quant.uniform:f32, 4.000000e-01>>) -> tensor<2x3x1024x3xf32> - %14 = "tf.XlaCallModule"(%11, %13) <{Sout = [#tf_type.shape<2x3x1x3>], dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = ["CPU"], version = 9 : i64}> {_entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _stablehlo_module_attrs = {jax.uses_shape_polymorphism = true}, _tfl_quant_trait = "fully_quantizable", device = ""} : (tensor<2x3x1x1024xf32>, tensor<2x3x1024x3xf32>) -> tensor<2x3x1x3xf32> + %14 = "tf.XlaCallModule"(%11, %13) <{Sout = [#tf_type.shape<2x3x1x3>], dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = ["CPU"], version = 9 : i64}> {_entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _quantization_method = "static_range_ptq {}", _stablehlo_module_attrs = {jax.uses_shape_polymorphism = true}, _tfl_quant_trait = "fully_quantizable", device = ""} : (tensor<2x3x1x1024xf32>, tensor<2x3x1024x3xf32>) -> tensor<2x3x1x3xf32> %15 = 
"quantfork.qcast"(%14) {volatile} : (tensor<2x3x1x3xf32>) -> tensor<2x3x1x3x!quant.uniform> %16 = "quantfork.dcast"(%15) : (tensor<2x3x1x3x!quant.uniform>) -> tensor<2x3x1x3xf32> return %16 : tensor<2x3x1x3xf32> diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/quantize/quantize_same_scale.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/quantize/quantize_same_scale.mlir index 9be0add0ba4551..25aab3044a3496 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/quantize/quantize_same_scale.mlir +++ b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/quantize/quantize_same_scale.mlir @@ -16,7 +16,7 @@ module attributes {tf_saved_model.semantics} { %1 = "quantfork.dcast"(%0) : (tensor<1x2x!quant.uniform>) -> tensor<1x2xf32> %2 = "quantfork.qcast"(%arg1) {volatile} : (tensor<2x3xf32>) -> tensor<2x3x!quant.uniform:f32:1, {6.000000e-03,6.000000e-03,6.000000e-03}>> %3 = "quantfork.dcast"(%2) : (tensor<2x3x!quant.uniform:f32:1, {6.000000e-03,6.000000e-03,6.000000e-03}>>) -> tensor<2x3xf32> - %4 = "tf.XlaCallModule"(%1, %3) {Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32> + %4 = "tf.XlaCallModule"(%1, %3) {Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _quantization_method = "static_range_ptq {}", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32> %5 = 
"quantfork.qcast"(%4) {volatile} : (tensor<1x3xf32>) -> tensor<1x3x!quant.uniform> %6 = "quantfork.dcast"(%5) : (tensor<1x3x!quant.uniform>) -> tensor<1x3xf32> %7 = stablehlo.reshape %6 : (tensor<1x3xf32>) -> tensor<3x1xf32> @@ -58,7 +58,7 @@ module attributes {tf_saved_model.semantics} { %1 = "quantfork.dcast"(%0) : (tensor<1x2x!quant.uniform>) -> tensor<1x2xf32> %2 = "quantfork.qcast"(%arg1) {volatile} : (tensor<2x3xf32>) -> tensor<2x3x!quant.uniform:f32:1, {6.000000e-03,6.000000e-03,6.000000e-03}>> %3 = "quantfork.dcast"(%2) : (tensor<2x3x!quant.uniform:f32:1, {6.000000e-03,6.000000e-03,6.000000e-03}>>) -> tensor<2x3xf32> - %4 = "tf.XlaCallModule"(%1, %3) {Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32> + %4 = "tf.XlaCallModule"(%1, %3) {Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _quantization_method = "static_range_ptq {}", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32> %5 = "quantfork.qcast"(%4) {volatile} : (tensor<1x3xf32>) -> tensor<1x3x!quant.uniform> %6 = "quantfork.dcast"(%5) : (tensor<1x3x!quant.uniform>) -> tensor<1x3xf32> %7 = stablehlo.reshape %6 : (tensor<1x3xf32>) -> tensor<3x1xf32> @@ -133,7 +133,7 @@ module attributes {tf_saved_model.semantics} { %6 = "quantfork.dcast"(%5) : (tensor<4x2x!quant.uniform>) -> tensor<4x2xf32> %7 = "quantfork.qcast"(%arg2) {volatile} : (tensor<2x5xf32>) -> 
tensor<2x5x!quant.uniform:f32:1, {6.000000e-03,6.000000e-03,6.000000e-03,6.000000e-03,6.000000e-03}>> %8 = "quantfork.dcast"(%7) : (tensor<2x5x!quant.uniform:f32:1, {6.000000e-03,6.000000e-03,6.000000e-03,6.000000e-03,6.000000e-03}>>) -> tensor<2x5xf32> - %9 = "tf.XlaCallModule"(%6, %8) {Sout = [#tf_type.shape<4x5>], _entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<4x2xf32>, tensor<2x5xf32>) -> tensor<4x5xf32> + %9 = "tf.XlaCallModule"(%6, %8) {Sout = [#tf_type.shape<4x5>], _entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _quantization_method = "static_range_ptq {}", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<4x2xf32>, tensor<2x5xf32>) -> tensor<4x5xf32> %10 = "quantfork.qcast"(%9) {volatile} : (tensor<4x5xf32>) -> tensor<4x5x!quant.uniform> %11 = "quantfork.dcast"(%10) : (tensor<4x5x!quant.uniform>) -> tensor<4x5xf32> return %11 : tensor<4x5xf32> @@ -173,7 +173,7 @@ module attributes {tf_saved_model.semantics} { %1 = "quantfork.dcast"(%0) : (tensor<1x2x!quant.uniform>) -> tensor<1x2xf32> %2 = "quantfork.qcast"(%arg1) {volatile} : (tensor<2x3xf32>) -> tensor<2x3x!quant.uniform:f32:1, {6.000000e-03,6.000000e-03,6.000000e-03}>> %3 = "quantfork.dcast"(%2) : (tensor<2x3x!quant.uniform:f32:1, {6.000000e-03,6.000000e-03,6.000000e-03}>>) -> tensor<2x3xf32> - %4 = "tf.XlaCallModule"(%1, %3) {Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _stablehlo_module_attrs = {}, _tfl_quant_trait = 
"fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32> + %4 = "tf.XlaCallModule"(%1, %3) {Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _quantization_method = "static_range_ptq {}", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32> %5 = "quantfork.qcast"(%4) {volatile} : (tensor<1x3xf32>) -> tensor<1x3x!quant.uniform> %6 = "quantfork.dcast"(%5) : (tensor<1x3x!quant.uniform>) -> tensor<1x3xf32> %7 = "quantfork.qcast"(%arg2) {volatile} : (tensor) -> tensor> @@ -218,7 +218,7 @@ module attributes {tf_saved_model.semantics} { %1 = "quantfork.dcast"(%0) : (tensor<1x2x!quant.uniform>) -> tensor<1x2xf32> %2 = "quantfork.qcast"(%arg1) {volatile} : (tensor<2x3xf32>) -> tensor<2x3x!quant.uniform:f32:1, {6.000000e-03,6.000000e-03,6.000000e-03}>> %3 = "quantfork.dcast"(%2) : (tensor<2x3x!quant.uniform:f32:1, {6.000000e-03,6.000000e-03,6.000000e-03}>>) -> tensor<2x3xf32> - %4 = "tf.XlaCallModule"(%1, %3) {Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32> + %4 = "tf.XlaCallModule"(%1, %3) {Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _quantization_method = "static_range_ptq {}", _stablehlo_module_attrs = 
{}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32> %5 = "quantfork.qcast"(%4) {volatile} : (tensor<1x3xf32>) -> tensor<1x3x!quant.uniform> %6 = "quantfork.dcast"(%5) : (tensor<1x3x!quant.uniform>) -> tensor<1x3xf32> %7 = "quantfork.qcast"(%arg3) {volatile} : (tensor<1x3xf32>) -> tensor<1x3x!quant.uniform> @@ -260,7 +260,7 @@ module attributes {tf_saved_model.semantics} { %1 = "quantfork.dcast"(%0) : (tensor<1x2x!quant.uniform>) -> tensor<1x2xf32> %2 = "quantfork.qcast"(%arg1) {volatile} : (tensor<2x3xf32>) -> tensor<2x3x!quant.uniform:f32:1, {6.000000e-03,6.000000e-03,6.000000e-03}>> %3 = "quantfork.dcast"(%2) : (tensor<2x3x!quant.uniform:f32:1, {6.000000e-03,6.000000e-03,6.000000e-03}>>) -> tensor<2x3xf32> - %4 = "tf.XlaCallModule"(%1, %3) {Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32> + %4 = "tf.XlaCallModule"(%1, %3) {Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _quantization_method = "static_range_ptq {}", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32> %5 = "quantfork.qcast"(%4) {volatile} : (tensor<1x3xf32>) -> tensor<1x3x!quant.uniform> %6 = "quantfork.dcast"(%5) : (tensor<1x3x!quant.uniform>) -> tensor<1x3xf32> %7 = 
stablehlo.broadcast_in_dim %6, dims = [2, 1] : (tensor<1x3xf32>) -> tensor<2x3x2xf32> @@ -302,7 +302,7 @@ module attributes {tf_saved_model.semantics} { %1 = "quantfork.dcast"(%0) : (tensor<3x4x5x!quant.uniform>) -> tensor<3x4x5xf32> %2 = "quantfork.qcast"(%arg1) {volatile} : (tensor<3x5x2xf32>) -> tensor<3x5x2x!quant.uniform:f32, 6.000000e-03:13>> %3 = "quantfork.dcast"(%2) : (tensor<3x5x2x!quant.uniform:f32, 6.000000e-03:13>>) -> tensor<3x5x2xf32> - %4 = "tf.XlaCallModule"(%1, %3) {Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<3x4x5xf32>, tensor<3x5x2xf32>) -> tensor<3x4x2xf32> + %4 = "tf.XlaCallModule"(%1, %3) {Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _quantization_method = "static_range_ptq {}", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<3x4x5xf32>, tensor<3x5x2xf32>) -> tensor<3x4x2xf32> %5 = "quantfork.qcast"(%4) {volatile} : (tensor<3x4x2xf32>) -> tensor<3x4x2x!quant.uniform> %6 = "quantfork.dcast"(%5) : (tensor<3x4x2x!quant.uniform>) -> tensor<3x4x2xf32> %7 = "stablehlo.gather"(%6, %arg2) { @@ -350,7 +350,7 @@ module attributes {tf_saved_model.semantics} { %1 = "quantfork.dcast"(%0) : (tensor<3x2x!quant.uniform>) -> tensor<3x2xf32> %2 = "quantfork.qcast"(%arg1) {volatile} : (tensor<2x4xf32>) -> tensor<2x4x!quant.uniform:f32:1, {6.000000e-03,6.000000e-03,6.000000e-03,6.000000e-03}>> %3 = "quantfork.dcast"(%2) : (tensor<2x4x!quant.uniform:f32:1, {6.000000e-03,6.000000e-03,6.000000e-03,6.000000e-03}>>) 
-> tensor<2x4xf32> - %4 = "tf.XlaCallModule"(%1, %3) {Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<3x2xf32>, tensor<2x4xf32>) -> tensor<3x4xf32> + %4 = "tf.XlaCallModule"(%1, %3) {Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _quantization_method = "static_range_ptq {}", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<3x2xf32>, tensor<2x4xf32>) -> tensor<3x4xf32> %5 = "quantfork.qcast"(%4) {volatile} : (tensor<3x4xf32>) -> tensor<3x4x!quant.uniform> %6 = "quantfork.dcast"(%5) : (tensor<3x4x!quant.uniform>) -> tensor<3x4xf32> %7 = stablehlo.slice %6 [1:3, 2:4] : (tensor<3x4xf32>) -> tensor<2x2xf32> diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/quantize/quantize_weight_only.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/quantize/quantize_weight_only.mlir index 6db474de676ccc..81e8b4bde5e13e 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/quantize/quantize_weight_only.mlir +++ b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/quantize/quantize_weight_only.mlir @@ -8,7 +8,7 @@ module attributes {tf_saved_model.semantics} { %cst = stablehlo.constant dense<3.000000e-01> : tensor<2x3xf32> %0 = "quantfork.qcast"(%cst) : (tensor<2x3xf32>) -> tensor<2x3x!quant.uniform> %1 = "quantfork.dcast"(%0) : (tensor<2x3x!quant.uniform>) -> tensor<2x3xf32> - %2 = "tf.XlaCallModule"(%arg0, %1) <{Sout = [#tf_type.shape<1x3>], dim_args_spec = [], 
disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64}> {_entry_function = @composite_dot_general_fn, _original_entry_function = "composite_dot_general_fn", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = ""} : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32> + %2 = "tf.XlaCallModule"(%arg0, %1) <{Sout = [#tf_type.shape<1x3>], dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64}> {_entry_function = @composite_dot_general_fn, _original_entry_function = "composite_dot_general_fn", _quantization_method = "weight_only_ptq { }", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = ""} : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32> return %2 : tensor<1x3xf32> } @@ -22,7 +22,8 @@ module attributes {tf_saved_model.semantics} { // CHECK-SAME: %[[ARG0:.+]]: tensor<1x2xf32> // CHECK: %[[CST:.+]] = stablehlo.constant dense<3.000000e-01> : tensor<2x3xf32> // CHECK: %[[Q:.+]] = "quantfork.qcast"(%[[CST]]) : (tensor<2x3xf32>) -> tensor<2x3x!quant.uniform> -// CHECK: %[[CALL:.+]] = call @quantized_dot_general_fn(%[[ARG0]], %[[Q]]) : (tensor<1x2xf32>, tensor<2x3x!quant.uniform>) -> tensor<1x3xf32> +// CHECK: %[[CALL:.+]] = call @quantized_dot_general_fn(%[[ARG0]], %[[Q]]) +// CHECK-SAME: {_quantization_method = "weight_only_ptq { }"} : (tensor<1x2xf32>, tensor<2x3x!quant.uniform>) -> tensor<1x3xf32> // CHECK: return %[[CALL]] // CHECK: quantized_dot_general_fn @@ -41,7 +42,7 @@ module attributes {tf_saved_model.semantics} { %cst = stablehlo.constant dense<3.000000e-01> : tensor<2x3x3x2xf32> %0 = "quantfork.qcast"(%cst) : (tensor<2x3x3x2xf32>) -> tensor<2x3x3x2x!quant.uniform> %1 = "quantfork.dcast"(%0) : (tensor<2x3x3x2x!quant.uniform>) -> tensor<2x3x3x2xf32> - %2 = "tf.XlaCallModule"(%arg0, %1) <{Sout = [#tf_type.shape<1x3x4x2>], dim_args_spec = [], disabled_checks = [], has_token_input_output = 
false, module = "", platforms = [], version = 5 : i64}> {_entry_function = @composite_conv_fn, _original_entry_function = "composite_conv_fn", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = ""} : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<1x3x4x2xf32> + %2 = "tf.XlaCallModule"(%arg0, %1) <{Sout = [#tf_type.shape<1x3x4x2>], dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64}> {_entry_function = @composite_conv_fn, _original_entry_function = "composite_conv_fn", _quantization_method = "weight_only_ptq { }", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = ""} : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<1x3x4x2xf32> return %2 : tensor<1x3x4x2xf32> } @@ -55,7 +56,7 @@ module attributes {tf_saved_model.semantics} { // CHECK-SAME: %[[ARG0:.+]]: tensor<1x3x4x3xf32> // CHECK: %[[CST:.+]] = stablehlo.constant dense<3.000000e-01> : tensor<2x3x3x2xf32> // CHECK: %[[Q:.+]] = "quantfork.qcast"(%[[CST]]) : (tensor<2x3x3x2xf32>) -> tensor<2x3x3x2x!quant.uniform> -// CHECK: %[[CALL:.+]] = call @quantized_conv_fn(%[[ARG0]], %[[Q]]) : (tensor<1x3x4x3xf32>, tensor<2x3x3x2x!quant.uniform>) -> tensor<1x3x4x2xf32> +// CHECK: %[[CALL:.+]] = call @quantized_conv_fn(%[[ARG0]], %[[Q]]) {_quantization_method = "weight_only_ptq { }"} : (tensor<1x3x4x3xf32>, tensor<2x3x3x2x!quant.uniform>) -> tensor<1x3x4x2xf32> // CHECK: return %[[CALL]] // CHECK: quantized_conv_fn diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/quantize_composite_functions.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/quantize_composite_functions.mlir index f9fa9ce5f60b87..09f002559b7830 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/quantize_composite_functions.mlir +++ b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/quantize_composite_functions.mlir @@ -9,7 +9,7 @@ module attributes 
{tf_saved_model.semantics} { func.func private @quantize_dot_general_fn(%arg0: tensor<1x2xf32>) -> tensor<1x3xf32> attributes {tf._original_func_name = "main_0"} { %cst = "tf.Const"() {value = dense<3.00000000e-1> : tensor<2x3xf32>} : () -> tensor<2x3xf32> %0 = "quantfork.stats"(%arg0) {layerStats = dense<[6.00000000e-6, 9.00000000e-1]> : tensor<2xf32>} : (tensor<1x2xf32>) -> tensor<1x2xf32> - %1 = "tf.XlaCallModule"(%0, %cst) {Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_fn, _original_entry_function = "composite_dot_general_fn", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32> + %1 = "tf.XlaCallModule"(%0, %cst) {Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_fn, _original_entry_function = "composite_dot_general_fn", _quantization_method = "static_range_ptq { }", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32> %2 = "quantfork.stats"(%1) {layerStats = dense<[5.00000000e-6, 7.00000000e-1]> : tensor<2xf32>} : (tensor<1x3xf32>) -> tensor<1x3xf32> return %2 : tensor<1x3xf32> } @@ -19,14 +19,14 @@ module attributes {tf_saved_model.semantics} { // CHECK: func.func private @quantize_dot_general_fn(%[[ARG_0:.+]]: tensor<1x2xf32>) -> tensor<1x3xf32> attributes {tf._original_func_name = "main_0"} // CHECK: %[[CONST_0:.+]] = stablehlo.constant() {value = dense<{{.*}}> : tensor<2x3xi8>} : () -> tensor<2x3x!quant.uniform:f32:1, {{.*}}> // CHECK: %[[UNIFORM_QUANTIZE_0:.+]] = stablehlo.uniform_quantize %[[ARG_0]] : (tensor<1x2xf32>) -> tensor<1x2x!quant.uniform> -// CHECK: %[[CALL_0:.+]] = call 
@quantized_dot_general_fn(%[[UNIFORM_QUANTIZE_0]], %[[CONST_0]]) : (tensor<1x2x!quant.uniform>, tensor<2x3x!quant.uniform:f32:1, {{.*}}>) -> tensor<1x3x!quant.uniform> +// CHECK: %[[CALL_0:.+]] = call @quantized_dot_general_fn(%[[UNIFORM_QUANTIZE_0]], %[[CONST_0]]) {_quantization_method = "static_range_ptq { }"} : (tensor<1x2x!quant.uniform>, tensor<2x3x!quant.uniform:f32:1, {{.*}}>) -> tensor<1x3x!quant.uniform> // CHECK: %[[UNIFORM_DEQUANTIZE_0:.+]] = stablehlo.uniform_dequantize %[[CALL_0]] : (tensor<1x3x!quant.uniform) -> tensor<1x3xf32> // CHECK: return %[[UNIFORM_DEQUANTIZE_0]] : tensor<1x3xf32> // CHECK-PER-TENSOR: func.func private @quantize_dot_general_fn(%[[ARG_0:.+]]: tensor<1x2xf32>) -> tensor<1x3xf32> attributes {tf._original_func_name = "main_0"} // CHECK-PER-TENSOR: %[[CONST_0:.+]] = stablehlo.constant() {value = dense<{{.*}}> : tensor<2x3xi8>} : () -> tensor<2x3x!quant.uniform:f32, {{.*}}> // CHECK-PER-TENSOR: %[[UNIFORM_QUANTIZE_0:.+]] = stablehlo.uniform_quantize %[[ARG_0]] : (tensor<1x2xf32>) -> tensor<1x2x!quant.uniform> -// CHECK-PER-TENSOR: %[[CALL_0:.+]] = call @quantized_dot_general_fn(%[[UNIFORM_QUANTIZE_0]], %[[CONST_0]]) : (tensor<1x2x!quant.uniform>, tensor<2x3x!quant.uniform:f32, {{.*}}>) -> tensor<1x3x!quant.uniform> +// CHECK-PER-TENSOR: %[[CALL_0:.+]] = call @quantized_dot_general_fn(%[[UNIFORM_QUANTIZE_0]], %[[CONST_0]]) {_quantization_method = "static_range_ptq { }"} : (tensor<1x2x!quant.uniform>, tensor<2x3x!quant.uniform:f32, {{.*}}>) -> tensor<1x3x!quant.uniform> // CHECK-PER-TENSOR: %[[UNIFORM_DEQUANTIZE_0:.+]] = stablehlo.uniform_dequantize %[[CALL_0]] : (tensor<1x3x!quant.uniform) -> tensor<1x3xf32> // CHECK-PER-TENSOR: return %[[UNIFORM_DEQUANTIZE_0]] : tensor<1x3xf32> @@ -57,14 +57,14 @@ module attributes {tf_saved_model.semantics} { func.func private @quantize_dot_general_batch_per_tensor_quantized_fn(%arg0: tensor<2x2x2xf32>) -> tensor<2x2x3xf32> attributes {tf._original_func_name = "main_0"} { %cst = "tf.Const"() {value 
= dense<3.00000000e-1> : tensor<2x2x3xf32>} : () -> tensor<2x2x3xf32> %0 = "quantfork.stats"(%arg0) {layerStats = dense<[6.00000000e-6, 9.00000000e-1]> : tensor<2xf32>} : (tensor<2x2x2xf32>) -> tensor<2x2x2xf32> - %1 = "tf.XlaCallModule"(%0, %cst) {Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_fn, _original_entry_function = "composite_dot_general_fn", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<2x2x2xf32>, tensor<2x2x3xf32>) -> tensor<2x2x3xf32> + %1 = "tf.XlaCallModule"(%0, %cst) {Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_fn, _original_entry_function = "composite_dot_general_fn", _quantization_method = "static_range_ptq { }", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<2x2x2xf32>, tensor<2x2x3xf32>) -> tensor<2x2x3xf32> %2 = "quantfork.stats"(%1) {layerStats = dense<[5.00000000e-6, 7.00000000e-1]> : tensor<2xf32>} : (tensor<2x2x3xf32>) -> tensor<2x2x3xf32> return %2 : tensor<2x2x3xf32> } // CHECK: func.func private @quantize_dot_general_batch_per_tensor_quantized_fn(%[[ARG_0:.+]]: tensor<2x2x2xf32>) -> tensor<2x2x3xf32> attributes {tf._original_func_name = "main_0"} // CHECK: %[[CONST_0:.+]] = stablehlo.constant() {value = dense<127> : tensor<2x2x3xi8>} : () -> tensor<2x2x3x!quant.uniform> // CHECK: %[[UNIFORM_QUANTIZE_0:.+]] = stablehlo.uniform_quantize %[[ARG_0]] : (tensor<2x2x2xf32>) -> tensor<2x2x2x!quant.uniform> -// CHECK: %[[CALL_0:.+]] = call @quantized_dot_general_fn(%[[UNIFORM_QUANTIZE_0]], %[[CONST_0]]) : (tensor<2x2x2x!quant.uniform>, tensor<2x2x3x!quant.uniform) -> tensor<2x2x3x!quant.uniform> +// CHECK: %[[CALL_0:.+]] = call 
@quantized_dot_general_fn(%[[UNIFORM_QUANTIZE_0]], %[[CONST_0]]) {_quantization_method = "static_range_ptq { }"} : (tensor<2x2x2x!quant.uniform>, tensor<2x2x3x!quant.uniform) -> tensor<2x2x3x!quant.uniform> // CHECK: %[[UNIFORM_DEQUANTIZE_0:.+]] = stablehlo.uniform_dequantize %[[CALL_0]] : (tensor<2x2x3x!quant.uniform) -> tensor<2x2x3xf32> // CHECK: return %[[UNIFORM_DEQUANTIZE_0]] : tensor<2x2x3xf32> @@ -83,7 +83,7 @@ module attributes {tf_saved_model.semantics} { %cst = "tf.Const"() {value = dense<3.00000000e-1> : tensor<2x3xf32>} : () -> tensor<2x3xf32> %cst_0 = "tf.Const"() {value = dense<4.00000000e-1> : tensor<1x3xf32>} : () -> tensor<1x3xf32> %0 = "quantfork.stats"(%arg0) {layerStats = dense<[6.00000000e-6, 9.00000000e-1]> : tensor<2xf32>} : (tensor<1x2xf32>) -> tensor<1x2xf32> - %1 = "tf.XlaCallModule"(%0, %cst, %cst_0) {Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_with_bias_same_shape_fn, _original_entry_function = "composite_dot_general_with_bias_same_shape_fn", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<1x2xf32>, tensor<2x3xf32>, tensor<1x3xf32>) -> tensor<1x3xf32> + %1 = "tf.XlaCallModule"(%0, %cst, %cst_0) {Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_with_bias_same_shape_fn, _original_entry_function = "composite_dot_general_with_bias_same_shape_fn", _quantization_method = "static_range_ptq { }", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<1x2xf32>, tensor<2x3xf32>, tensor<1x3xf32>) -> tensor<1x3xf32> %2 = "quantfork.stats"(%1) {layerStats = dense<[5.00000000e-6, 7.00000000e-1]> : tensor<2xf32>} : (tensor<1x3xf32>) -> tensor<1x3xf32> return %2 : tensor<1x3xf32> } 
@@ -91,7 +91,7 @@ module attributes {tf_saved_model.semantics} { // CHECK: %[[CONST_0:.+]] = stablehlo.constant() {value = dense<{{.*}}> : tensor<2x3xi8>} : () -> tensor<2x3x!quant.uniform:f32:1, {{.*}}> // CHECK: %[[CONST_1:.+]] = stablehlo.constant() {value = dense<{{.*}}> : tensor<1x3xi32>} : () -> tensor<1x3x!quant.uniform // CHECK: %[[UNIFORM_QUANTIZE_0:.+]] = stablehlo.uniform_quantize %[[ARG_0]] : (tensor<1x2xf32>) -> tensor<1x2x!quant.uniform> -// CHECK: %[[CALL_0:.+]] = call @quantized_dot_general_with_bias_same_shape_fn(%[[UNIFORM_QUANTIZE_0]], %[[CONST_0]], %[[CONST_1]]) : (tensor<1x2x!quant.uniform>, tensor<2x3x!quant.uniform:f32:1, {{.*}}>, tensor<1x3x!quant.uniform) -> tensor<1x3x!quant.uniform +// CHECK: %[[CALL_0:.+]] = call @quantized_dot_general_with_bias_same_shape_fn(%[[UNIFORM_QUANTIZE_0]], %[[CONST_0]], %[[CONST_1]]) {_quantization_method = "static_range_ptq { }"} : (tensor<1x2x!quant.uniform>, tensor<2x3x!quant.uniform:f32:1, {{.*}}>, tensor<1x3x!quant.uniform) -> tensor<1x3x!quant.uniform // CHECK: %[[UNIFORM_DEQUANTIZE_0:.+]] = stablehlo.uniform_dequantize %[[CALL_0]] : (tensor<1x3x!quant.uniform) -> tensor<1x3xf32> // CHECK: return %[[UNIFORM_DEQUANTIZE_0]] : tensor<1x3xf32> @@ -99,7 +99,7 @@ module attributes {tf_saved_model.semantics} { // CHECK-PER-TENSOR-DAG: %[[CONST_0:.+]] = stablehlo.constant() {value = dense<{{.*}}> : tensor<2x3xi8>} : () -> tensor<2x3x!quant.uniform:f32, {{.*}}> // CHECK-PER-TENSOR-DAG: %[[CONST_1:.+]] = stablehlo.constant() {value = dense<{{.*}}> : tensor<1x3xi32>} : () -> tensor<1x3x!quant.uniform // CHECK-PER-TENSOR: %[[UNIFORM_QUANTIZE_0:.+]] = stablehlo.uniform_quantize %[[ARG_0]] : (tensor<1x2xf32>) -> tensor<1x2x!quant.uniform> -// CHECK-PER-TENSOR: %[[CALL_0:.+]] = call @quantized_dot_general_with_bias_same_shape_fn(%[[UNIFORM_QUANTIZE_0]], %[[CONST_0]], %[[CONST_1]]) : (tensor<1x2x!quant.uniform>, tensor<2x3x!quant.uniform:f32, {{.*}}>, tensor<1x3x!quant.uniform) -> tensor<1x3x!quant.uniform +// 
CHECK-PER-TENSOR: %[[CALL_0:.+]] = call @quantized_dot_general_with_bias_same_shape_fn(%[[UNIFORM_QUANTIZE_0]], %[[CONST_0]], %[[CONST_1]]) {_quantization_method = "static_range_ptq { }"} : (tensor<1x2x!quant.uniform>, tensor<2x3x!quant.uniform:f32, {{.*}}>, tensor<1x3x!quant.uniform) -> tensor<1x3x!quant.uniform // CHECK-PER-TENSOR: %[[UNIFORM_DEQUANTIZE_0:.+]] = stablehlo.uniform_dequantize %[[CALL_0]] : (tensor<1x3x!quant.uniform) -> tensor<1x3xf32> // CHECK-PER-TENSOR: return %[[UNIFORM_DEQUANTIZE_0]] : tensor<1x3xf32> @@ -132,7 +132,7 @@ module attributes {tf_saved_model.semantics} { %cst = "tf.Const"() {value = dense<3.00000000e-1> : tensor<2x3xf32>} : () -> tensor<2x3xf32> %cst_0 = "tf.Const"() {value = dense<4.00000000e-1> : tensor<3xf32>} : () -> tensor<3xf32> %0 = "quantfork.stats"(%arg0) {layerStats = dense<[6.00000000e-6, 9.00000000e-1]> : tensor<2xf32>} : (tensor) -> tensor - %1 = "tf.XlaCallModule"(%0, %cst, %cst_0) {Sout = [#tf_type.shape], _entry_function = @composite_dot_general_with_bias_dynamic_fn, _original_entry_function = "composite_dot_general_with_bias_dynamic_fn", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor, tensor<2x3xf32>, tensor<3xf32>) -> tensor + %1 = "tf.XlaCallModule"(%0, %cst, %cst_0) {Sout = [#tf_type.shape], _entry_function = @composite_dot_general_with_bias_dynamic_fn, _original_entry_function = "composite_dot_general_with_bias_dynamic_fn", _quantization_method = "static_range_ptq { }", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor, tensor<2x3xf32>, tensor<3xf32>) -> tensor %2 = "quantfork.stats"(%1) {layerStats = dense<[5.00000000e-6, 7.00000000e-1]> : tensor<2xf32>} : (tensor) -> tensor return 
%2 : tensor } @@ -140,7 +140,7 @@ module attributes {tf_saved_model.semantics} { // CHECK-DAG: %[[CONST_0:.+]] = stablehlo.constant() {value = dense<{{.*}}> : tensor<2x3xi8>} : () -> tensor<2x3x!quant.uniform:f32:1, {{.*}}> // CHECK-DAG: %[[CONST_1:.+]] = stablehlo.constant() {value = dense<{{.*}}> : tensor<3xi32>} : () -> tensor<3x!quant.uniform // CHECK: %[[UNIFORM_QUANTIZE_0:.+]] = stablehlo.uniform_quantize %[[ARG_0]] : (tensor) -> tensor> -// CHECK: %[[CALL_0:.+]] = call @quantized_dot_general_with_bias_dynamic_fn(%[[UNIFORM_QUANTIZE_0]], %[[CONST_0]], %[[CONST_1]]) : (tensor>, tensor<2x3x!quant.uniform:f32:1, {{.*}}>, tensor<3x!quant.uniform) -> tensor +// CHECK: %[[CALL_0:.+]] = call @quantized_dot_general_with_bias_dynamic_fn(%[[UNIFORM_QUANTIZE_0]], %[[CONST_0]], %[[CONST_1]]) {_quantization_method = "static_range_ptq { }"} : (tensor>, tensor<2x3x!quant.uniform:f32:1, {{.*}}>, tensor<3x!quant.uniform) -> tensor // CHECK: %[[UNIFORM_DEQUANTIZE_0:.+]] = stablehlo.uniform_dequantize %[[CALL_0]] : (tensor) -> tensor // CHECK: return %[[UNIFORM_DEQUANTIZE_0]] : tensor @@ -148,7 +148,7 @@ module attributes {tf_saved_model.semantics} { // CHECK-PER-TENSOR-DAG: %[[CONST_0:.+]] = stablehlo.constant() {value = dense<{{.*}}> : tensor<2x3xi8>} : () -> tensor<2x3x!quant.uniform:f32, {{.*}}> // CHECK-PER-TENSOR-DAG: %[[CONST_1:.+]] = stablehlo.constant() {value = dense<{{.*}}> : tensor<3xi32>} : () -> tensor<3x!quant.uniform // CHECK-PER-TENSOR: %[[UNIFORM_QUANTIZE_0:.+]] = stablehlo.uniform_quantize %[[ARG_0]] : (tensor) -> tensor> -// CHECK-PER-TENSOR: %[[CALL_0:.+]] = call @quantized_dot_general_with_bias_dynamic_fn(%[[UNIFORM_QUANTIZE_0]], %[[CONST_0]], %[[CONST_1]]) : (tensor>, tensor<2x3x!quant.uniform:f32, {{.*}}>, tensor<3x!quant.uniform) -> tensor +// CHECK-PER-TENSOR: %[[CALL_0:.+]] = call @quantized_dot_general_with_bias_dynamic_fn(%[[UNIFORM_QUANTIZE_0]], %[[CONST_0]], %[[CONST_1]]) {_quantization_method = "static_range_ptq { }"} : (tensor>, 
tensor<2x3x!quant.uniform:f32, {{.*}}>, tensor<3x!quant.uniform) -> tensor // CHECK-PER-TENSOR: %[[UNIFORM_DEQUANTIZE_0:.+]] = stablehlo.uniform_dequantize %[[CALL_0]] : (tensor) -> tensor // CHECK-PER-TENSOR: return %[[UNIFORM_DEQUANTIZE_0]] : tensor @@ -221,14 +221,16 @@ module attributes {tf_saved_model.semantics} { // CHECK: func.func private @quantize_conv_fn(%[[ARG_0:.+]]: tensor<1x3x4x3xf32>) -> tensor<1x3x4x2xf32> attributes {tf._original_func_name = "main_0"} // CHECK: %[[CONST_0:.+]] = stablehlo.constant() {value = dense<{{.*}}> : tensor<2x3x3x2xi8>} : () -> tensor<2x3x3x2x!quant.uniform:f32:3, {{.*}}> // CHECK: %[[UNIFORM_QUANTIZE_0:.+]] = stablehlo.uniform_quantize %[[ARG_0]] : (tensor<1x3x4x3xf32>) -> tensor<1x3x4x3x!quant.uniform> -// CHECK: %[[CALL_0:.+]] = call @quantized_conv_fn(%[[UNIFORM_QUANTIZE_0]], %[[CONST_0]]) : (tensor<1x3x4x3x!quant.uniform>, tensor<2x3x3x2x!quant.uniform:f32:3, {{.*}}>) -> tensor<1x3x4x2x!quant.uniform> +// CHECK: %[[CALL_0:.+]] = call @quantized_conv_fn(%[[UNIFORM_QUANTIZE_0]], %[[CONST_0]]) +// CHECK-SAME: {_quantization_method = "static_range_ptq {input_quantized_types {key: 1, value {dimension_specs {dimension: 3}}}}"} : (tensor<1x3x4x3x!quant.uniform>, tensor<2x3x3x2x!quant.uniform:f32:3, {{.*}}>) -> tensor<1x3x4x2x!quant.uniform> // CHECK: %[[UNIFORM_DEQUANTIZE_0:.+]] = stablehlo.uniform_dequantize %[[CALL_0]] : (tensor<1x3x4x2x!quant.uniform) -> tensor<1x3x4x2xf32> // CHECK: return %[[UNIFORM_DEQUANTIZE_0]] : tensor<1x3x4x2xf32> // CHECK-PER-TENSOR: func.func private @quantize_conv_fn(%[[ARG_0:.+]]: tensor<1x3x4x3xf32>) -> tensor<1x3x4x2xf32> attributes {tf._original_func_name = "main_0"} // CHECK-PER-TENSOR: %[[CONST_0:.+]] = stablehlo.constant() {value = dense<{{.*}}> : tensor<2x3x3x2xi8>} : () -> tensor<2x3x3x2x!quant.uniform:f32, {{.*}}> // CHECK-PER-TENSOR: %[[UNIFORM_QUANTIZE_0:.+]] = stablehlo.uniform_quantize %[[ARG_0]] : (tensor<1x3x4x3xf32>) -> tensor<1x3x4x3x!quant.uniform> -// CHECK-PER-TENSOR: 
%[[CALL_0:.+]] = call @quantized_conv_fn(%[[UNIFORM_QUANTIZE_0]], %[[CONST_0]]) : (tensor<1x3x4x3x!quant.uniform>, tensor<2x3x3x2x!quant.uniform:f32, {{.*}}>) -> tensor<1x3x4x2x!quant.uniform> +// CHECK-PER-TENSOR: %[[CALL_0:.+]] = call @quantized_conv_fn(%[[UNIFORM_QUANTIZE_0]], %[[CONST_0]]) +// CHECK-PER-TENSOR-SAME: {_quantization_method = "static_range_ptq {input_quantized_types {key: 1, value {dimension_specs {dimension: 3}}}}"} : (tensor<1x3x4x3x!quant.uniform>, tensor<2x3x3x2x!quant.uniform:f32, {{.*}}>) -> tensor<1x3x4x2x!quant.uniform> // CHECK-PER-TENSOR: %[[UNIFORM_DEQUANTIZE_0:.+]] = stablehlo.uniform_dequantize %[[CALL_0]] : (tensor<1x3x4x2x!quant.uniform) -> tensor<1x3x4x2xf32> // CHECK-PER-TENSOR: return %[[UNIFORM_DEQUANTIZE_0]] : tensor<1x3x4x2xf32> @@ -272,7 +274,7 @@ func.func @quantize_conv_fn_per_tensor(%arg0: tensor<1x3x4x3xf32>) -> tensor<1x3 version = 5 : i64, _entry_function = @composite_conv_fn, _original_entry_function = "composite_conv_fn", - _quantization_method = "static_range_ptq {}", + _quantization_method = "static_range_ptq { }", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "" @@ -286,7 +288,7 @@ func.func @quantize_conv_fn_per_tensor(%arg0: tensor<1x3x4x3xf32>) -> tensor<1x3 // CHECK-SAME: (%[[ARG_0:.+]]: tensor<1x3x4x3xf32>) -> tensor<1x3x4x2xf32> // CHECK-DAG: %[[CONST_0:.+]] = stablehlo.constant() {value = dense<{{.*}}> : tensor<2x3x3x2xi8>} : () -> tensor<2x3x3x2x!quant.uniform // CHECK: %[[UNIFORM_QUANTIZE_0:.+]] = stablehlo.uniform_quantize %[[ARG_0]] : (tensor<1x3x4x3xf32>) -> tensor<1x3x4x3x!quant.uniform> -// CHECK: %[[CALL_0:.+]] = call @quantized_conv_fn(%[[UNIFORM_QUANTIZE_0]], %[[CONST_0]]) : (tensor<1x3x4x3x!quant.uniform>, tensor<2x3x3x2x!quant.uniform) -> tensor<1x3x4x2x!quant.uniform> +// CHECK: %[[CALL_0:.+]] = call @quantized_conv_fn(%[[UNIFORM_QUANTIZE_0]], %[[CONST_0]]) {_quantization_method = "static_range_ptq { }"} : (tensor<1x3x4x3x!quant.uniform>, 
tensor<2x3x3x2x!quant.uniform) -> tensor<1x3x4x2x!quant.uniform> // CHECK: %[[UNIFORM_DEQUANTIZE_0:.+]] = stablehlo.uniform_dequantize %[[CALL_0]] : (tensor<1x3x4x2x!quant.uniform) -> tensor<1x3x4x2xf32> // CHECK: return %[[UNIFORM_DEQUANTIZE_0]] : tensor<1x3x4x2xf32> @@ -339,7 +341,8 @@ module attributes {tf_saved_model.semantics} { // CHECK-DAG: %[[CONST_0:.+]] = stablehlo.constant() {value = dense<{{.*}}> : tensor<2x3x3x2xi8>} : () -> tensor<2x3x3x2x!quant.uniform:f32:3, {{.*}}> // CHECK-DAG: %[[CONST_1:.+]] = stablehlo.constant() {value = dense<47978> : tensor<2xi32>} : () -> tensor<2x!quant.uniform> // CHECK: %[[UNIFORM_QUANTIZE_0:.+]] = stablehlo.uniform_quantize %[[ARG_0]] : (tensor<1x3x4x3xf32>) -> tensor<1x3x4x3x!quant.uniform> -// CHECK: %[[CALL_0:.+]] = call @quantized_conv_with_bias_1d_fn(%[[UNIFORM_QUANTIZE_0]], %[[CONST_0]], %[[CONST_1]]) : (tensor<1x3x4x3x!quant.uniform>, tensor<2x3x3x2x!quant.uniform:f32:3, {0.0023622048182750312,0.0023622048182750312}>>, tensor<2x!quant.uniform>) -> tensor<1x3x4x2x!quant.uniform> +// CHECK: %[[CALL_0:.+]] = call @quantized_conv_with_bias_1d_fn(%[[UNIFORM_QUANTIZE_0]], %[[CONST_0]], %[[CONST_1]]) +// CHECK-SAME: {_quantization_method = "static_range_ptq {input_quantized_types {key: 1, value {dimension_specs {dimension: 3}}}}"} : (tensor<1x3x4x3x!quant.uniform>, tensor<2x3x3x2x!quant.uniform:f32:3, {0.0023622048182750312,0.0023622048182750312}>>, tensor<2x!quant.uniform>) -> tensor<1x3x4x2x!quant.uniform> // CHECK: %[[UNIFORM_DEQUANTIZE_0:.+]] = stablehlo.uniform_dequantize %[[CALL_0]] : (tensor<1x3x4x2x!quant.uniform) -> tensor<1x3x4x2xf32> // CHECK: return %[[UNIFORM_DEQUANTIZE_0]] : tensor<1x3x4x2xf32> @@ -347,7 +350,8 @@ module attributes {tf_saved_model.semantics} { // CHECK-PER-TENSOR-DAG: %[[CONST_0:.+]] = stablehlo.constant() {value = dense<{{.*}}> : tensor<2x3x3x2xi8>} : () -> tensor<2x3x3x2x!quant.uniform:f32, {{.*}}> // CHECK-PER-TENSOR-DAG: %[[CONST_1:.+]] = stablehlo.constant() {value = dense<{{.*}}> : 
tensor<2xi32>} : () -> tensor<2x!quant.uniform // CHECK-PER-TENSOR: %[[UNIFORM_QUANTIZE_0:.+]] = stablehlo.uniform_quantize %[[ARG_0]] : (tensor<1x3x4x3xf32>) -> tensor<1x3x4x3x!quant.uniform> -// CHECK-PER-TENSOR: %[[CALL_0:.+]] = call @quantized_conv_with_bias_1d_fn(%[[UNIFORM_QUANTIZE_0]], %[[CONST_0]], %[[CONST_1]]) : (tensor<1x3x4x3x!quant.uniform>, tensor<2x3x3x2x!quant.uniform:f32, {{.*}}>, tensor<2x!quant.uniform) -> tensor<1x3x4x2x!quant.uniform +// CHECK-PER-TENSOR: %[[CALL_0:.+]] = call @quantized_conv_with_bias_1d_fn(%[[UNIFORM_QUANTIZE_0]], %[[CONST_0]], %[[CONST_1]]) +// CHECK-PER-TENSOR-SAME: {_quantization_method = "static_range_ptq {input_quantized_types {key: 1, value {dimension_specs {dimension: 3}}}}"} : (tensor<1x3x4x3x!quant.uniform>, tensor<2x3x3x2x!quant.uniform:f32, {{.*}}>, tensor<2x!quant.uniform) -> tensor<1x3x4x2x!quant.uniform // CHECK-PER-TENSOR: %[[UNIFORM_DEQUANTIZE_0:.+]] = stablehlo.uniform_dequantize %[[CALL_0]] : (tensor<1x3x4x2x!quant.uniform) -> tensor<1x3x4x2xf32> // CHECK-PER-TENSOR: return %[[UNIFORM_DEQUANTIZE_0]] : tensor<1x3x4x2xf32> @@ -406,7 +410,8 @@ module attributes {tf_saved_model.semantics} { // CHECK-DAG: %[[CONST_0:.+]] = stablehlo.constant() {value = dense<{{.*}}> : tensor<2x3x3x2xi8>} : () -> tensor<2x3x3x2x!quant.uniform:f32:3, {{.*}}> // CHECK-DAG: %[[CONST_1:.+]] = stablehlo.constant() {value = dense<{{.*}}> : tensor<1x1x1x2xi32>} : () -> tensor<1x1x1x2x!quant.uniform // CHECK: %[[UNIFORM_QUANTIZE_0:.+]] = stablehlo.uniform_quantize %[[ARG_0]] : (tensor<1x3x4x3xf32>) -> tensor<1x3x4x3x!quant.uniform> -// CHECK: %[[CALL_0:.+]] = call @quantized_conv_with_bias_fn(%[[UNIFORM_QUANTIZE_0]], %[[CONST_0]], %[[CONST_1]]) : (tensor<1x3x4x3x!quant.uniform>, tensor<2x3x3x2x!quant.uniform:f32:3, {{.*}}>, tensor<1x1x1x2x!quant.uniform) -> tensor<1x3x4x2x!quant.uniform +// CHECK: %[[CALL_0:.+]] = call @quantized_conv_with_bias_fn(%[[UNIFORM_QUANTIZE_0]], %[[CONST_0]], %[[CONST_1]]) +// CHECK-SAME: {_quantization_method = 
"static_range_ptq {input_quantized_types {key: 1, value {dimension_specs {dimension: 3}}}}"} : (tensor<1x3x4x3x!quant.uniform>, tensor<2x3x3x2x!quant.uniform:f32:3, {{.*}}>, tensor<1x1x1x2x!quant.uniform) -> tensor<1x3x4x2x!quant.uniform // CHECK: %[[UNIFORM_DEQUANTIZE_0:.+]] = stablehlo.uniform_dequantize %[[CALL_0]] : (tensor<1x3x4x2x!quant.uniform) -> tensor<1x3x4x2xf32> // CHECK: return %[[UNIFORM_DEQUANTIZE_0]] : tensor<1x3x4x2xf32> @@ -414,7 +419,8 @@ module attributes {tf_saved_model.semantics} { // CHECK-PER-TENSOR-DAG: %[[CONST_0:.+]] = stablehlo.constant() {value = dense<{{.*}}> : tensor<2x3x3x2xi8>} : () -> tensor<2x3x3x2x!quant.uniform:f32, {{.*}}> // CHECK-PER-TENSOR-DAG: %[[CONST_1:.+]] = stablehlo.constant() {value = dense<{{.*}}> : tensor<1x1x1x2xi32>} : () -> tensor<1x1x1x2x!quant.uniform // CHECK-PER-TENSOR: %[[UNIFORM_QUANTIZE_0:.+]] = stablehlo.uniform_quantize %[[ARG_0]] : (tensor<1x3x4x3xf32>) -> tensor<1x3x4x3x!quant.uniform> -// CHECK-PER-TENSOR: %[[CALL_0:.+]] = call @quantized_conv_with_bias_fn(%[[UNIFORM_QUANTIZE_0]], %[[CONST_0]], %[[CONST_1]]) : (tensor<1x3x4x3x!quant.uniform>, tensor<2x3x3x2x!quant.uniform:f32, {{.*}}>, tensor<1x1x1x2x!quant.uniform) -> tensor<1x3x4x2x!quant.uniform +// CHECK-PER-TENSOR: %[[CALL_0:.+]] = call @quantized_conv_with_bias_fn(%[[UNIFORM_QUANTIZE_0]], %[[CONST_0]], %[[CONST_1]]) +// CHECK-PER-TENSOR-SAME: {_quantization_method = "static_range_ptq {input_quantized_types {key: 1, value {dimension_specs {dimension: 3}}}}"} : (tensor<1x3x4x3x!quant.uniform>, tensor<2x3x3x2x!quant.uniform:f32, {{.*}}>, tensor<1x1x1x2x!quant.uniform) -> tensor<1x3x4x2x!quant.uniform // CHECK-PER-TENSOR: %[[UNIFORM_DEQUANTIZE_0:.+]] = stablehlo.uniform_dequantize %[[CALL_0]] : (tensor<1x3x4x2x!quant.uniform) -> tensor<1x3x4x2xf32> // CHECK-PER-TENSOR: return %[[UNIFORM_DEQUANTIZE_0]] : tensor<1x3x4x2xf32> @@ -472,7 +478,8 @@ module attributes {tf_saved_model.semantics} { // CHECK-DAG: %[[CONST_0:.+]] = stablehlo.constant() {value = 
dense<{{.*}}> : tensor<2x3x3x2xi8>} : () -> tensor<2x3x3x2x!quant.uniform:f32:3, {{.*}}> // CHECK-DAG: %[[CONST_1:.+]] = stablehlo.constant() {value = dense<{{.*}}> : tensor<1x1x1x2xi32>} : () -> tensor<1x1x1x2x!quant.uniform // CHECK: %[[UNIFORM_QUANTIZE_0:.+]] = stablehlo.uniform_quantize %[[ARG_0]] : (tensor) -> tensor> -// CHECK: %[[CALL_0:.+]] = call @quantized_conv_with_bias_dynamic_fn(%[[UNIFORM_QUANTIZE_0]], %[[CONST_0]], %[[CONST_1]]) : (tensor>, tensor<2x3x3x2x!quant.uniform:f32:3, {{.*}}>, tensor<1x1x1x2x!quant.uniform) -> tensor +// CHECK: %[[CALL_0:.+]] = call @quantized_conv_with_bias_dynamic_fn(%[[UNIFORM_QUANTIZE_0]], %[[CONST_0]], %[[CONST_1]]) +// CHECK-SAME: {_quantization_method = "static_range_ptq {input_quantized_types {key: 1, value {dimension_specs {dimension: 3}}}}"} : (tensor>, tensor<2x3x3x2x!quant.uniform:f32:3, {{.*}}>, tensor<1x1x1x2x!quant.uniform) -> tensor // CHECK: %[[UNIFORM_DEQUANTIZE_0:.+]] = stablehlo.uniform_dequantize %[[CALL_0]] : (tensor) -> tensor // CHECK: return %[[UNIFORM_DEQUANTIZE_0]] : tensor @@ -480,7 +487,8 @@ module attributes {tf_saved_model.semantics} { // CHECK-PER-TENSOR-DAG: %[[CONST_0:.+]] = stablehlo.constant() {value = dense<{{.*}}> : tensor<2x3x3x2xi8>} : () -> tensor<2x3x3x2x!quant.uniform:f32, {{.*}}> // CHECK-PER-TENSOR-DAG: %[[CONST_1:.+]] = stablehlo.constant() {value = dense<{{.*}}> : tensor<1x1x1x2xi32>} : () -> tensor<1x1x1x2x!quant.uniform // CHECK-PER-TENSOR: %[[UNIFORM_QUANTIZE_0:.+]] = stablehlo.uniform_quantize %[[ARG_0]] : (tensor) -> tensor> -// CHECK-PER-TENSOR: %[[CALL_0:.+]] = call @quantized_conv_with_bias_dynamic_fn(%[[UNIFORM_QUANTIZE_0]], %[[CONST_0]], %[[CONST_1]]) : (tensor>, tensor<2x3x3x2x!quant.uniform:f32, {{.*}}>, tensor<1x1x1x2x!quant.uniform) -> tensor +// CHECK-PER-TENSOR: %[[CALL_0:.+]] = call @quantized_conv_with_bias_dynamic_fn(%[[UNIFORM_QUANTIZE_0]], %[[CONST_0]], %[[CONST_1]]) +// CHECK-PER-TENSOR-SAME: {_quantization_method = "static_range_ptq {input_quantized_types
{key: 1, value {dimension_specs {dimension: 3}}}}"} : (tensor>, tensor<2x3x3x2x!quant.uniform:f32, {{.*}}>, tensor<1x1x1x2x!quant.uniform) -> tensor // CHECK-PER-TENSOR: %[[UNIFORM_DEQUANTIZE_0:.+]] = stablehlo.uniform_dequantize %[[CALL_0]] : (tensor) -> tensor // CHECK-PER-TENSOR: return %[[UNIFORM_DEQUANTIZE_0]] : tensor @@ -564,7 +572,8 @@ module attributes {tf_saved_model.semantics} { // CHECK-DAG: %[[CONST_0:.+]] = stablehlo.constant() {value = dense<{{.*}}> : tensor<2x3x3x2xi8>} : () -> tensor<2x3x3x2x!quant.uniform:f32:3, {{.*}}> // CHECK-DAG: %[[CONST_1:.+]] = stablehlo.constant() {value = dense<{{.*}}> : tensor<1x1x1x2xi32>} : () -> tensor<1x1x1x2x!quant.uniform // CHECK: %[[UNIFORM_QUANTIZE_0:.+]] = stablehlo.uniform_quantize %[[ARG_0]] : (tensor) -> tensor> -// CHECK: %[[CALL_0:.+]] = call @quantized_conv_with_bias_and_relu_dynamic_fn(%[[UNIFORM_QUANTIZE_0]], %[[CONST_0]], %[[CONST_1]]) : (tensor>, tensor<2x3x3x2x!quant.uniform:f32:3, {0.0023622048182750312,0.0023622048182750312}>>, tensor<1x1x1x2x!quant.uniform>) -> tensor> +// CHECK: %[[CALL_0:.+]] = call @quantized_conv_with_bias_and_relu_dynamic_fn(%[[UNIFORM_QUANTIZE_0]], %[[CONST_0]], %[[CONST_1]]) +// CHECK-SAME: {_quantization_method = "static_range_ptq {input_quantized_types {key: 1, value {dimension_specs {dimension: 3}}}}"} : (tensor>, tensor<2x3x3x2x!quant.uniform:f32:3, {0.0023622048182750312,0.0023622048182750312}>>, tensor<1x1x1x2x!quant.uniform>) -> tensor> // CHECK: %[[UNIFORM_DEQUANTIZE_0:.+]] = stablehlo.uniform_dequantize %[[CALL_0]] : (tensor) -> tensor // CHECK: return %[[UNIFORM_DEQUANTIZE_0]] : tensor @@ -572,7 +581,8 @@ module attributes {tf_saved_model.semantics} { // CHECK-PER-TENSOR-DAG: %[[CONST_0:.+]] = stablehlo.constant() {value = dense<{{.*}}> : tensor<2x3x3x2xi8>} : () -> tensor<2x3x3x2x!quant.uniform:f32, {{.*}}> // CHECK-PER-TENSOR-DAG: %[[CONST_1:.+]] = stablehlo.constant() {value = dense<{{.*}}> : tensor<1x1x1x2xi32>} : () -> tensor<1x1x1x2x!quant.uniform // 
CHECK-PER-TENSOR: %[[UNIFORM_QUANTIZE_0:.+]] = stablehlo.uniform_quantize %[[ARG_0]] : (tensor) -> tensor> -// CHECK-PER-TENSOR: %[[CALL_0:.+]] = call @quantized_conv_with_bias_and_relu_dynamic_fn(%[[UNIFORM_QUANTIZE_0]], %[[CONST_0]], %[[CONST_1]]) : (tensor>, tensor<2x3x3x2x!quant.uniform:f32, 0.0023622048182750312>>, tensor<1x1x1x2x!quant.uniform>) -> tensor> +// CHECK-PER-TENSOR: %[[CALL_0:.+]] = call @quantized_conv_with_bias_and_relu_dynamic_fn(%[[UNIFORM_QUANTIZE_0]], %[[CONST_0]], %[[CONST_1]]) +// CHECK-PER-TENSOR-SAME: {_quantization_method = "static_range_ptq {input_quantized_types {key: 1, value {dimension_specs {dimension: 3}}}}"} : (tensor>, tensor<2x3x3x2x!quant.uniform:f32, 0.0023622048182750312>>, tensor<1x1x1x2x!quant.uniform>) -> tensor> // CHECK-PER-TENSOR: %[[UNIFORM_DEQUANTIZE_0:.+]] = stablehlo.uniform_dequantize %[[CALL_0]] : (tensor) -> tensor // CHECK-PER-TENSOR: return %[[UNIFORM_DEQUANTIZE_0]] : tensor @@ -659,7 +669,8 @@ module attributes {tf_saved_model.semantics} { // CHECK-DAG: %[[CONST_0:.+]] = stablehlo.constant() {value = dense<{{.*}}> : tensor<2x3x3x2xi8>} : () -> tensor<2x3x3x2x!quant.uniform:f32:3, {{.*}}> // CHECK-DAG: %[[CONST_1:.+]] = stablehlo.constant() {value = dense<{{.*}}> : tensor<1x1x1x2xi32>} : () -> tensor<1x1x1x2x!quant.uniform // CHECK: %[[UNIFORM_QUANTIZE_0:.+]] = stablehlo.uniform_quantize %[[ARG_0]] : (tensor) -> tensor> -// CHECK: %[[CALL_0:.+]] = call @quantized_conv_with_bias_and_relu6_dynamic_fn(%[[UNIFORM_QUANTIZE_0]], %[[CONST_0]], %[[CONST_1]]) : (tensor>, tensor<2x3x3x2x!quant.uniform:f32:3, {0.0023622048182750312,0.0023622048182750312}>>, tensor<1x1x1x2x!quant.uniform>) -> tensor> +// CHECK: %[[CALL_0:.+]] = call @quantized_conv_with_bias_and_relu6_dynamic_fn(%[[UNIFORM_QUANTIZE_0]], %[[CONST_0]], %[[CONST_1]]) +// CHECK-SAME: {_quantization_method = "static_range_ptq {input_quantized_types {key: 1, value {dimension_specs {dimension: 3}}}}"} : (tensor>, tensor<2x3x3x2x!quant.uniform:f32:3, 
{0.0023622048182750312,0.0023622048182750312}>>, tensor<1x1x1x2x!quant.uniform>) -> tensor> // CHECK: %[[UNIFORM_DEQUANTIZE_0:.+]] = stablehlo.uniform_dequantize %[[CALL_0]] : (tensor) -> tensor // CHECK: return %[[UNIFORM_DEQUANTIZE_0]] : tensor @@ -667,7 +678,8 @@ module attributes {tf_saved_model.semantics} { // CHECK-PER-TENSOR-DAG: %[[CONST_0:.+]] = stablehlo.constant() {value = dense<{{.*}}> : tensor<2x3x3x2xi8>} : () -> tensor<2x3x3x2x!quant.uniform:f32, {{.*}}> // CHECK-PER-TENSOR-DAG: %[[CONST_1:.+]] = stablehlo.constant() {value = dense<{{.*}}> : tensor<1x1x1x2xi32>} : () -> tensor<1x1x1x2x!quant.uniform // CHECK-PER-TENSOR: %[[UNIFORM_QUANTIZE_0:.+]] = stablehlo.uniform_quantize %[[ARG_0]] : (tensor) -> tensor> -// CHECK-PER-TENSOR: %[[CALL_0:.+]] = call @quantized_conv_with_bias_and_relu6_dynamic_fn(%[[UNIFORM_QUANTIZE_0]], %[[CONST_0]], %[[CONST_1]]) : (tensor>, tensor<2x3x3x2x!quant.uniform:f32, 0.0023622048182750312>>, tensor<1x1x1x2x!quant.uniform>) -> tensor> +// CHECK-PER-TENSOR: %[[CALL_0:.+]] = call @quantized_conv_with_bias_and_relu6_dynamic_fn(%[[UNIFORM_QUANTIZE_0]], %[[CONST_0]], %[[CONST_1]]) +// CHECK-PER-TENSOR-SAME: {_quantization_method = "static_range_ptq {input_quantized_types {key: 1, value {dimension_specs {dimension: 3}}}}"} : (tensor>, tensor<2x3x3x2x!quant.uniform:f32, 0.0023622048182750312>>, tensor<1x1x1x2x!quant.uniform>) -> tensor> // CHECK-PER-TENSOR: %[[UNIFORM_DEQUANTIZE_0:.+]] = stablehlo.uniform_dequantize %[[CALL_0]] : (tensor) -> tensor // CHECK-PER-TENSOR: return %[[UNIFORM_DEQUANTIZE_0]] : tensor @@ -715,12 +727,12 @@ module attributes {tf_saved_model.semantics} { // ----- -// Tests that XlaCallModule op is not quantized without the quantfork.stats ops. +// Tests that XlaCallModule op is not quantized and converted to func.call without the quantfork.stats ops.
module attributes {tf_saved_model.semantics} { func.func private @not_quantized_without_stats_fn(%arg0: tensor<1x2xf32>) -> tensor<1x3xf32> attributes {tf._original_func_name = "main_0"} { %cst = "tf.Const"() {value = dense<3.00000000e-1> : tensor<2x3xf32>} : () -> tensor<2x3xf32> - %1 = "tf.XlaCallModule"(%arg0, %cst) {Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_fn, _original_entry_function = "composite_dot_general_fn", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32> + %1 = "tf.XlaCallModule"(%arg0, %cst) {Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_fn, _original_entry_function = "composite_dot_general_fn", _quantization_method = "static_range_ptq { }", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32> return %1 : tensor<1x3xf32> } // Check that "tf.Const" is converted to stablehlo.constant. 
XlaCallModule is @@ -728,8 +740,8 @@ module attributes {tf_saved_model.semantics} { // CHECK: func.func private @not_quantized_without_stats_fn(%[[ARG_0:.+]]: tensor<1x2xf32>) -> tensor<1x3xf32> attributes {tf._original_func_name = "main_0"} // CHECK: %[[CONST_0:.+]] = stablehlo.constant dense<3.000000e-01> : tensor<2x3xf32> -// CHECK: %[[XLA_CALL_MODULE_0:.+]] = "tf.XlaCallModule"(%[[ARG_0]], %[[CONST_0]]) <{{{.*}}}> {{{.*_entry_function = @composite_dot_general_fn.*}}} : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32> -// CHECK: return %[[XLA_CALL_MODULE_0]] +// CHECK: %[[CALL:.+]] = call @composite_dot_general_fn(%[[ARG_0]], %[[CONST_0]]) : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32> +// CHECK: return %[[CALL]] func.func private @composite_dot_general_fn(%arg0: tensor<1x2xf32>, %arg1: tensor<2x3xf32>) -> tensor<1x3xf32> attributes {_from_xla_call_module} { %0 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [1] x [0] : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32> @@ -750,7 +762,7 @@ module attributes {tf_saved_model.semantics} { func.func private @quantize_gather_fn(%arg: tensor<3x4x2xf32>) -> tensor<2x3x2x2xf32> attributes {tf._original_func_name = "main_0"} { %cst = "tf.Const"() {value = dense<1> : tensor<2x3x2xi32>} : () -> tensor<2x3x2xi32> %0 = "quantfork.stats"(%arg) {layerStats = dense<[4.00000000e-6, 9.80000000e-1]> : tensor<2xf32>} : (tensor<3x4x2xf32>) -> tensor<3x4x2xf32> - %1 = "tf.XlaCallModule"(%0, %cst) {Sout = [#tf_type.shape<2x3x2x2>], _entry_function = @composite_gather_fn, _original_entry_function = "composite_gather_fn", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<3x4x2xf32>, tensor<2x3x2xi32>) -> tensor<2x3x2x2xf32> + %1 = "tf.XlaCallModule"(%0, %cst) {Sout = [#tf_type.shape<2x3x2x2>], _entry_function = @composite_gather_fn, 
_original_entry_function = "composite_gather_fn", _quantization_method = "static_range_ptq { }", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<3x4x2xf32>, tensor<2x3x2xi32>) -> tensor<2x3x2x2xf32> %2 = "quantfork.stats"(%1) {layerStats = dense<[4.00000000e-6, 9.80000000e-1]> : tensor<2xf32>} : (tensor<2x3x2x2xf32>) -> tensor<2x3x2x2xf32> return %2 : tensor<2x3x2x2xf32> } @@ -758,7 +770,7 @@ module attributes {tf_saved_model.semantics} { // calls the quantized entry function. // CHECK: %[[CONST:.+]] = stablehlo.constant dense<{{.*}}> : tensor<2x3x2xi32> // CHECK: %[[UNIFORM_QUANTIZE:.+]] = stablehlo.uniform_quantize %[[ARG_0]] : (tensor<3x4x2xf32>) -> tensor<3x4x2x!quant.uniform> -// CHECK: %[[CALL:.+]] = call @quantized_gather_fn(%[[UNIFORM_QUANTIZE]], %[[CONST]]) : (tensor<3x4x2x!quant.uniform>, tensor<2x3x2xi32>) -> tensor<2x3x2x2x!quant.uniform> +// CHECK: %[[CALL:.+]] = call @quantized_gather_fn(%[[UNIFORM_QUANTIZE]], %[[CONST]]) {_quantization_method = "static_range_ptq { }"} : (tensor<3x4x2x!quant.uniform>, tensor<2x3x2xi32>) -> tensor<2x3x2x2x!quant.uniform> // CHECK: %[[UNIFORM_DEQUANTIZE:.+]] = stablehlo.uniform_dequantize %[[CALL]] : (tensor<2x3x2x2x!quant.uniform) -> tensor<2x3x2x2xf32> // CHECK: return %[[UNIFORM_DEQUANTIZE]] : tensor<2x3x2x2xf32> @@ -776,5 +788,52 @@ module attributes {tf_saved_model.semantics} { return %0 : tensor<2x3x2x2xf32> } // CHECK: %[[GATHER:.+]] = "stablehlo.gather"(%[[ARG_0]], %[[ARG_1]]) {{.*}} : (tensor<3x4x2x!quant.uniform>, tensor<2x3x2xi32>) -> tensor<2x3x2x2x!quant.uniform> -// CHECK: return %[[GATHER]] : tensor<2x3x2x2x!quant.uniform> +// CHECK: %[[UNIFORM_QUANTIZE_0:.+]] = stablehlo.uniform_quantize %[[GATHER]] : tensor<2x3x2x2x!quant.uniform> +// CHECK: return %[[UNIFORM_QUANTIZE_0]] : tensor<2x3x2x2x!quant.uniform> +} + +// ----- + +// Tests that a 
basic `stablehlo.add` and a fused `stablehlo.dot_general` +// are properly quantized. + +module attributes {tf_saved_model.semantics} { +// CHECK: func.func private @quantize_add_fn(%[[ARG:.+]]: tensor<1x2xf32>) -> tensor<1x3xf32> attributes {tf._original_func_name = "main_0"} + func.func private @quantize_add_fn(%arg: tensor<1x2xf32>) -> tensor<1x3xf32> attributes {tf._original_func_name = "main_0"} { + %cst_0 = "tf.Const"() {value = dense<1.00000000e-1> : tensor<1x2xf32>} : () -> tensor<1x2xf32> + %cst_1 = "tf.Const"() {value = dense<1.00000000e-1> : tensor<2x3xf32>} : () -> tensor<2x3xf32> + %0 = "quantfork.stats"(%arg) {layerStats = dense<[4.00000000e-6, 9.80000000e-1]> : tensor<2xf32>} : (tensor<1x2xf32>) -> tensor<1x2xf32> + %1 = "tf.XlaCallModule"(%0, %cst_0) {Sout = [#tf_type.shape<1x2>], _entry_function = @composite_add_fn, _original_entry_function = "composite_add_fn", _quantization_method = "static_range_ptq { }", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<1x2xf32>, tensor<1x2xf32>) -> tensor<1x2xf32> + %2 = "quantfork.stats"(%1) {layerStats = dense<[4.00000000e-6, 9.80000000e-1]> : tensor<2xf32>} : (tensor<1x2xf32>) -> tensor<1x2xf32> + %3 = "quantfork.stats"(%2) {layerStats = dense<[5.00000000e-6, 6.00000000e-1]> : tensor<2xf32>} : (tensor<1x2xf32>) -> tensor<1x2xf32> + %4 = "tf.XlaCallModule"(%3, %cst_1) {Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_fn, _original_entry_function = "composite_dot_general_fn", _quantization_method = "static_range_ptq { }", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32> + %5 = "quantfork.stats"(%4) {layerStats = 
dense<[5.00000000e-6, 9.80000000e-1]> : tensor<2xf32>} : (tensor<1x3xf32>) -> tensor<1x3xf32> + return %5 : tensor<1x3xf32> + } +// CHECK: %[[CONST:.+]] = stablehlo.constant() {value = dense<127> : tensor<1x2xi8>} : () -> tensor<1x2x!quant.uniform> +// CHECK: %[[CONST_0:.+]] = stablehlo.constant() {value = dense<127> : tensor<2x3xi8>} : () -> tensor<2x3x!quant.uniform:f32:1, {{.*}}>> +// CHECK: %[[UNIFORM_QUANTIZE:.+]] = stablehlo.uniform_quantize %[[ARG]] : (tensor<1x2xf32>) -> tensor<1x2x!quant.uniform> +// CHECK: %[[CALL:.+]] = call @quantized_add_fn(%[[UNIFORM_QUANTIZE]], %[[CONST]]) {_quantization_method = "static_range_ptq { }"} : (tensor<1x2x!quant.uniform>, tensor<1x2x!quant.uniform>) -> tensor<1x2x!quant.uniform> +// CHECK: %[[UNIFORM_DEQUANTIZE:.+]] = stablehlo.uniform_dequantize %[[CALL]] : (tensor<1x2x!quant.uniform>) -> tensor<1x2xf32> +// CHECK: %[[UNIFORM_QUANTIZE_0:.+]] = stablehlo.uniform_quantize %[[UNIFORM_DEQUANTIZE]] : (tensor<1x2xf32>) -> tensor<1x2x!quant.uniform> +// CHECK: %[[CALL_0:.+]] = call @quantized_dot_general_fn(%[[UNIFORM_QUANTIZE_0]], %[[CONST_0]]) {_quantization_method = "static_range_ptq { }"} : (tensor<1x2x!quant.uniform>, tensor<2x3x!quant.uniform:f32:1, {{.*}}>>) -> tensor<1x3x!quant.uniform> +// CHECK: %[[UNIFORM_DEQUANTIZE_0:.+]] = stablehlo.uniform_dequantize %[[CALL_0]] : (tensor<1x3x!quant.uniform>) -> tensor<1x3xf32> +// CHECK: return %[[UNIFORM_DEQUANTIZE_0]] : tensor<1x3xf32> + +// CHECK: func.func private @quantized_add_fn(%[[ARG_0:.+]]: tensor<1x2x!quant.uniform>, %[[ARG_1:.+]]: tensor<1x2x!quant.uniform>) -> tensor<1x2x!quant.uniform> attributes {_from_xla_call_module} + func.func private @composite_add_fn(%arg0: tensor<1x2xf32>, %arg1: tensor<1x2xf32>) -> tensor<1x2xf32> attributes {_from_xla_call_module} { + %0 = stablehlo.add %arg0, %arg1 : tensor<1x2xf32> + return %0 : tensor<1x2xf32> + } +// CHECK: %[[ADD:.+]] = stablehlo.add %arg0, %arg1 : (tensor<1x2x!quant.uniform>, tensor<1x2x!quant.uniform>) -> 
tensor<1x2x!quant.uniform> +// CHECK: return %[[ADD]] : tensor<1x2x!quant.uniform> + +// CHECK: func.func private @quantized_dot_general_fn(%[[ARG_0:.+]]: tensor<1x2x!quant.uniform>, %[[ARG_1:.+]]: tensor<2x3x!quant.uniform:f32:1, {{.*}}>>) -> tensor<1x3x!quant.uniform> attributes {_from_xla_call_module} + func.func private @composite_dot_general_fn(%arg0: tensor<1x2xf32>, %arg1: tensor<2x3xf32>) -> tensor<1x3xf32> attributes {_from_xla_call_module} { + %0 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [1] x [0] : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32> + return %0 : tensor<1x3xf32> + } +// CHECK: %[[DOT_GENERAL:.+]] = stablehlo.dot_general %arg0, %arg1, contracting_dims = [1] x [0] : (tensor<1x2x!quant.uniform>, tensor<2x3x!quant.uniform:f32:1,{{.*}}>>) -> tensor<1x3x!quant.uniform> +// CHECK: %[[UNIFORM_QUANTIZE:.+]] = stablehlo.uniform_quantize %[[DOT_GENERAL]] : (tensor<1x3x!quant.uniform>) -> tensor<1x3x!quant.uniform> +// CHECK: return %[[UNIFORM_QUANTIZE]] : tensor<1x3x!quant.uniform> } diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/quantize_composite_functions_all_ops.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/quantize_composite_functions_all_ops.mlir deleted file mode 100644 index 72851d92b64b75..00000000000000 --- a/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/quantize_composite_functions_all_ops.mlir +++ /dev/null @@ -1,46 +0,0 @@ -// RUN: stablehlo-quant-opt %s -split-input-file -verify-diagnostics \ -// RUN: -stablehlo-quantize-composite-functions=enable-full-int-quantization=true | FileCheck --check-prefix=CHECK-FULL-INT %s - -// Tests that a basic `stablehlo.add` and a fused `stablehlo.dot_general` -// are properly quantized. 
- -module attributes {tf_saved_model.semantics} { -// CHECK-FULL-INT: func.func private @quantize_add_fn(%[[ARG:.+]]: tensor<1x2xf32>) -> tensor<1x3xf32> attributes {tf._original_func_name = "main_0"} - func.func private @quantize_add_fn(%arg: tensor<1x2xf32>) -> tensor<1x3xf32> attributes {tf._original_func_name = "main_0"} { - %cst_0 = "tf.Const"() {value = dense<1.00000000e-1> : tensor<1x2xf32>} : () -> tensor<1x2xf32> - %cst_1 = "tf.Const"() {value = dense<1.00000000e-1> : tensor<2x3xf32>} : () -> tensor<2x3xf32> - %0 = "quantfork.stats"(%arg) {layerStats = dense<[4.00000000e-6, 9.80000000e-1]> : tensor<2xf32>} : (tensor<1x2xf32>) -> tensor<1x2xf32> - %1 = "tf.XlaCallModule"(%0, %cst_0) {Sout = [#tf_type.shape<1x2>], _entry_function = @composite_add_fn, _original_entry_function = "composite_add_fn", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<1x2xf32>, tensor<1x2xf32>) -> tensor<1x2xf32> - %2 = "quantfork.stats"(%1) {layerStats = dense<[4.00000000e-6, 9.80000000e-1]> : tensor<2xf32>} : (tensor<1x2xf32>) -> tensor<1x2xf32> - %3 = "quantfork.stats"(%2) {layerStats = dense<[5.00000000e-6, 6.00000000e-1]> : tensor<2xf32>} : (tensor<1x2xf32>) -> tensor<1x2xf32> - %4 = "tf.XlaCallModule"(%3, %cst_1) {Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_fn, _original_entry_function = "composite_dot_general_fn", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32> - %5 = "quantfork.stats"(%4) {layerStats = dense<[5.00000000e-6, 9.80000000e-1]> : tensor<2xf32>} : (tensor<1x3xf32>) -> tensor<1x3xf32> - return %5 : tensor<1x3xf32> - } -// CHECK-FULL-INT: %[[CONST:.+]] = 
stablehlo.constant() {value = dense<127> : tensor<1x2xi8>} : () -> tensor<1x2x!quant.uniform> -// CHECK-FULL-INT: %[[CONST_0:.+]] = stablehlo.constant() {value = dense<127> : tensor<2x3xi8>} : () -> tensor<2x3x!quant.uniform:f32:1, {{.*}}>> -// CHECK-FULL-INT: %[[UNIFORM_QUANTIZE:.+]] = stablehlo.uniform_quantize %[[ARG]] : (tensor<1x2xf32>) -> tensor<1x2x!quant.uniform> -// CHECK-FULL-INT: %[[CALL:.+]] = call @quantized_add_fn(%[[UNIFORM_QUANTIZE]], %[[CONST]]) : (tensor<1x2x!quant.uniform>, tensor<1x2x!quant.uniform>) -> tensor<1x2x!quant.uniform> -// CHECK-FULL-INT: %[[UNIFORM_DEQUANTIZE:.+]] = stablehlo.uniform_dequantize %[[CALL]] : (tensor<1x2x!quant.uniform>) -> tensor<1x2xf32> -// CHECK-FULL-INT: %[[UNIFORM_QUANTIZE_0:.+]] = stablehlo.uniform_quantize %[[UNIFORM_DEQUANTIZE]] : (tensor<1x2xf32>) -> tensor<1x2x!quant.uniform> -// CHECK-FULL-INT: %[[CALL_0:.+]] = call @quantized_dot_general_fn(%[[UNIFORM_QUANTIZE_0]], %[[CONST_0]]) : (tensor<1x2x!quant.uniform>, tensor<2x3x!quant.uniform:f32:1, {{.*}}>>) -> tensor<1x3x!quant.uniform> -// CHECK-FULL-INT: %[[UNIFORM_DEQUANTIZE_0:.+]] = stablehlo.uniform_dequantize %[[CALL_0]] : (tensor<1x3x!quant.uniform>) -> tensor<1x3xf32> -// CHECK-FULL-INT: return %[[UNIFORM_DEQUANTIZE_0]] : tensor<1x3xf32> - -// CHECK-FULL-INT: func.func private @quantized_add_fn(%[[ARG_0:.+]]: tensor<1x2x!quant.uniform>, %[[ARG_1:.+]]: tensor<1x2x!quant.uniform>) -> tensor<1x2x!quant.uniform> attributes {_from_xla_call_module} - func.func private @composite_add_fn(%arg0: tensor<1x2xf32>, %arg1: tensor<1x2xf32>) -> tensor<1x2xf32> attributes {_from_xla_call_module} { - %0 = stablehlo.add %arg0, %arg1 : tensor<1x2xf32> - return %0 : tensor<1x2xf32> - } -// CHECK-FULL-INT: %[[ADD:.+]] = stablehlo.add %arg0, %arg1 : (tensor<1x2x!quant.uniform>, tensor<1x2x!quant.uniform>) -> tensor<1x2x!quant.uniform> -// CHECK-FULL-INT: return %[[ADD]] : tensor<1x2x!quant.uniform> - -// CHECK-FULL-INT: func.func private 
@quantized_dot_general_fn(%[[ARG_0:.+]]: tensor<1x2x!quant.uniform>, %[[ARG_1:.+]]: tensor<2x3x!quant.uniform:f32:1, {{.*}}>>) -> tensor<1x3x!quant.uniform> attributes {_from_xla_call_module} - func.func private @composite_dot_general_fn(%arg0: tensor<1x2xf32>, %arg1: tensor<2x3xf32>) -> tensor<1x3xf32> attributes {_from_xla_call_module} { - %0 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [1] x [0] : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32> - return %0 : tensor<1x3xf32> - } -// CHECK-FULL-INT: %[[DOT_GENERAL:.+]] = stablehlo.dot_general %arg0, %arg1, contracting_dims = [1] x [0] : (tensor<1x2x!quant.uniform>, tensor<2x3x!quant.uniform:f32:1,{{.*}}>>) -> tensor<1x3x!quant.uniform> -// CHECK-FULL-INT: %[[UNIFORM_QUANTIZE:.+]] = stablehlo.uniform_quantize %[[DOT_GENERAL]] : (tensor<1x3x!quant.uniform>) -> tensor<1x3x!quant.uniform> -// CHECK-FULL-INT: return %[[UNIFORM_QUANTIZE]] : tensor<1x3x!quant.uniform> -} diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/quantize_composite_functions_weight_only.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/quantize_composite_functions_weight_only.mlir index dce15fe07760e2..b96cb15039d763 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/quantize_composite_functions_weight_only.mlir +++ b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/quantize_composite_functions_weight_only.mlir @@ -2,12 +2,12 @@ // RUN: -stablehlo-quantize-composite-functions=enable-weight-only=true | FileCheck --check-prefix=CHECK %s // Test that weight-only quantized dot_general op is produced when -// enable-weight-only is set to true. +// weight_only_ptq is provided. 
module attributes {tf_saved_model.semantics} { func.func private @quantize_dot_general_fn(%arg0: tensor<1x2xf32>) -> tensor<1x3xf32> attributes {tf._original_func_name = "main_0"} { %0 = stablehlo.constant dense<3.000000e-01> : tensor<2x3xf32> - %1 = "tf.XlaCallModule"(%arg0, %0) <{Sout = [#tf_type.shape<1x3>], dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64}> {_entry_function = @composite_dot_general_fn, _original_entry_function = "composite_dot_general_fn", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = ""} : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32> + %1 = "tf.XlaCallModule"(%arg0, %0) <{Sout = [#tf_type.shape<1x3>], dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64}> {_entry_function = @composite_dot_general_fn, _original_entry_function = "composite_dot_general_fn", _quantization_method = "weight_only_ptq { }", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = ""} : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32> return %1 : tensor<1x3xf32> } @@ -20,7 +20,7 @@ module attributes {tf_saved_model.semantics} { // CHECK-LABEL: quantize_dot_general_fn // CHECK-SAME: %[[ARG0:.+]]: tensor<1x2xf32> // CHECK: %[[CST:.+]] = stablehlo.constant() {value = dense<127> : tensor<2x3xi8>} : () -> tensor<2x3x!quant.uniform> -// CHECK: %[[CALL:.+]] = call @quantized_dot_general_fn(%[[ARG0]], %[[CST]]) : (tensor<1x2xf32>, tensor<2x3x!quant.uniform>) -> tensor<1x3xf32> +// CHECK: %[[CALL:.+]] = call @quantized_dot_general_fn(%[[ARG0]], %[[CST]]) {_quantization_method = "weight_only_ptq { }"} : (tensor<1x2xf32>, tensor<2x3x!quant.uniform>) -> tensor<1x3xf32> // CHECK: return %[[CALL]] // CHECK: quantized_dot_general_fn @@ -31,13 +31,13 @@ module attributes {tf_saved_model.semantics} { // ----- -// Test that hybrid quantized convolution op is produced when 
enable-weight-only -// is set to true. +// Test that hybrid quantized convolution op is produced when weight_only_ptq is +// provided. module attributes {tf_saved_model.semantics} { func.func private @quantize_conv_fn(%arg0: tensor<1x3x4x3xf32>) -> tensor<1x3x4x2xf32> attributes {tf._original_func_name = "main_0"} { %0 = stablehlo.constant dense<3.000000e-01> : tensor<2x3x3x2xf32> - %1 = "tf.XlaCallModule"(%arg0, %0) <{Sout = [#tf_type.shape<1x3x4x2>], dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64}> {_entry_function = @composite_conv_fn, _original_entry_function = "composite_conv_fn", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = ""} : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<1x3x4x2xf32> + %1 = "tf.XlaCallModule"(%arg0, %0) <{Sout = [#tf_type.shape<1x3x4x2>], dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64}> {_entry_function = @composite_conv_fn, _original_entry_function = "composite_conv_fn", _quantization_method = "weight_only_ptq { }", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = ""} : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<1x3x4x2xf32> return %1 : tensor<1x3x4x2xf32> } @@ -50,7 +50,7 @@ module attributes {tf_saved_model.semantics} { // CHECK-LABEL: quantize_conv_fn // CHECK-SAME: %[[ARG0:.+]]: tensor<1x3x4x3xf32> // CHECK: %[[CST:.+]] = stablehlo.constant() {value = dense<127> : tensor<2x3x3x2xi8>} : () -> tensor<2x3x3x2x!quant.uniform> -// CHECK: %[[CALL:.+]] = call @quantized_conv_fn(%[[ARG0]], %[[CST]]) : (tensor<1x3x4x3xf32>, tensor<2x3x3x2x!quant.uniform>) -> tensor<1x3x4x2xf32> +// CHECK: %[[CALL:.+]] = call @quantized_conv_fn(%[[ARG0]], %[[CST]]) {_quantization_method = "weight_only_ptq { }"} : (tensor<1x3x4x3xf32>, tensor<2x3x3x2x!quant.uniform>) -> tensor<1x3x4x2xf32> // CHECK: return %[[CALL]] // CHECK: 
quantized_conv_fn diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/replace_stablehlo_ops_in_main_function_with_xla_call_module_ops.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/replace_stablehlo_ops_in_main_function_with_xla_call_module_ops.mlir index eccf25e1acbfed..02e1c5e9923915 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/replace_stablehlo_ops_in_main_function_with_xla_call_module_ops.mlir +++ b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/replace_stablehlo_ops_in_main_function_with_xla_call_module_ops.mlir @@ -22,25 +22,25 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, p func.func @main(%arg0: tensor<1x1024xf32> {tf_saved_model.index_path = ["input_tensor"]}) -> (tensor<1x64xf32> {tf_saved_model.index_path = ["output"]}) attributes {tf.entry_function = {control_outputs = "", inputs = "serving_default_input_tensor:0", outputs = "PartitionedCall:0"}, tf_saved_model.exported_names = ["serving_default"]} { %0 = stablehlo.constant dense<1.000000e+03> : tensor<1024x3xf32> %1 = stablehlo.constant dense<1.000000e+03> : tensor<1x3xf32> - %2 = "tf.CustomAggregator"(%arg0) {calibration_method = 1 : i32, id = "0", initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<1x1024xf32>) -> tensor<1x1024xf32> - %3 = "tf.XlaCallModule"(%2, %0, %1) {Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _tfl_quant_trait = "fully_quantizable", dim_args_spec = [], disabled_checks = [], function_list = [], has_token_input_output = false, module = "", platforms = ["CPU"], version = 9 : i64} : (tensor<1x1024xf32>, tensor<1024x3xf32>, tensor<1x3xf32>) -> tensor<1x3xf32> - %4 = "tf.CustomAggregator"(%3) {calibration_method = 1 : i32, id = "1", initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 
0.000000e+00 : f32} : (tensor<1x3xf32>) -> tensor<1x3xf32> + %2:4 = "tf.CustomAggregator"(%arg0) {calibration_method = 1 : i32, id = "0", initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<1x1024xf32>) -> (tensor<1x1024xf32>, tensor, tensor, tensor<*xi64>) + %3 = "tf.XlaCallModule"(%2#0, %0, %1) {Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _tfl_quant_trait = "fully_quantizable", dim_args_spec = [], disabled_checks = [], function_list = [], has_token_input_output = false, module = "", platforms = ["CPU"], version = 9 : i64} : (tensor<1x1024xf32>, tensor<1024x3xf32>, tensor<1x3xf32>) -> tensor<1x3xf32> + %4:4 = "tf.CustomAggregator"(%3) {calibration_method = 1 : i32, id = "1", initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<1x3xf32>) -> (tensor<1x3xf32>, tensor, tensor, tensor<*xi64>) %5 = stablehlo.constant dense<1.000000e+03> : tensor<3x64xf32> %6 = stablehlo.constant dense<1.000000e+03> : tensor<1x64xf32> - %7 = "tf.CustomAggregator"(%4) {calibration_method = 1 : i32, id = "0", initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<1x3xf32>) -> tensor<1x3xf32> - %8 = "tf.XlaCallModule"(%7, %5, %6) {Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_with_relu_fn_1, _original_entry_function = "composite_dot_general_with_relu_fn_1", _tfl_quant_trait = "fully_quantizable", dim_args_spec = [], disabled_checks = [], function_list = [], has_token_input_output = false, module = "", platforms = ["CPU"], version = 9 : i64} : (tensor<1x3xf32>, tensor<3x64xf32>, tensor<1x64xf32>) -> tensor<1x64xf32> - %9 = "tf.CustomAggregator"(%6) {calibration_method = 1 : i32, id = "1", initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : 
(tensor<1x64xf32>) -> tensor<1x64xf32> - return %9 : tensor<1x64xf32> + %7:4 = "tf.CustomAggregator"(%4#0) {calibration_method = 1 : i32, id = "0", initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<1x3xf32>) -> (tensor<1x3xf32>, tensor, tensor, tensor<*xi64>) + %8 = "tf.XlaCallModule"(%7#0, %5, %6) {Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_with_relu_fn_1, _original_entry_function = "composite_dot_general_with_relu_fn_1", _tfl_quant_trait = "fully_quantizable", dim_args_spec = [], disabled_checks = [], function_list = [], has_token_input_output = false, module = "", platforms = ["CPU"], version = 9 : i64} : (tensor<1x3xf32>, tensor<3x64xf32>, tensor<1x64xf32>) -> tensor<1x64xf32> + %9:4 = "tf.CustomAggregator"(%6) {calibration_method = 1 : i32, id = "1", initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<1x64xf32>) -> (tensor<1x64xf32>, tensor, tensor, tensor<*xi64>) + return %9#0 : tensor<1x64xf32> } // CHECK: %[[STABLEHLO_SUBGRAPH_TO_XLA_CALL_MODULE_OP_0:.*]] = "tf.XlaCallModule"() <{Sout = [#tf_type.shape<{{.*}}>, #tf_type.shape<{{.*}}>], {{.*}}, module = "", platforms = ["CPU", "TPU"], version = 9 : i64}> {_entry_function = @_stablehlo_main_1 - // CHECK: %[[CUSTOM_AGGREGATOR_0:.*]] = "tf.CustomAggregator"(%arg0) <{id = "0"}> {calibration_method = 1 : i32, initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<1x1024xf32>) -> tensor<1x1024xf32> + // CHECK: %[[CUSTOM_AGGREGATOR_0:.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%arg0) <{id = "0"}> {calibration_method = 1 : i32, initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} // CHECK: %[[XLA_CALL_MODULE_0:.*]] = "tf.XlaCallModule"(%[[CUSTOM_AGGREGATOR_0]], %[[STABLEHLO_SUBGRAPH_TO_XLA_CALL_MODULE_OP_0:.*]], 
%[[STABLEHLO_SUBGRAPH_TO_XLA_CALL_MODULE_OP_0:.*]]) <{Sout = [#tf_type.shape<1x3>], {{.*}}, module = "", platforms = ["CPU"], version = 9 : i64}> {_entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _tfl_quant_trait = "fully_quantizable"} - // CHECK: %[[CUSTOM_AGGREGATOR_1:.*]] = "tf.CustomAggregator"(%[[XLA_CALL_MODULE_0]]) + // CHECK: %[[CUSTOM_AGGREGATOR_1:.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%[[XLA_CALL_MODULE_0]]) // CHECK: %[[STABLEHLO_SUBGRAPH_TO_XLA_CALL_MODULE_OP_1:.*]] = "tf.XlaCallModule"() <{Sout = [#tf_type.shape<{{.*}}>, #tf_type.shape<{{.*}}>], {{.*}}, module = "", platforms = ["CPU", "TPU"], version = 9 : i64}> {_entry_function = @_stablehlo_main_0 - // CHECK: %[[CUSTOM_AGGREGATOR_2:.*]] = "tf.CustomAggregator"(%[[CUSTOM_AGGREGATOR_1]]) <{id = "0"}> {calibration_method = 1 : i32, initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<1x3xf32>) -> tensor<1x3xf32> + // CHECK: %[[CUSTOM_AGGREGATOR_2:.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%[[CUSTOM_AGGREGATOR_1]]) <{id = "0"}> {calibration_method = 1 : i32, initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} // CHECK: %[[XLA_CALL_MODULE_1:.*]] = "tf.XlaCallModule"(%[[CUSTOM_AGGREGATOR_2]], %[[STABLEHLO_SUBGRAPH_TO_XLA_CALL_MODULE_OP_1:.*]], %[[STABLEHLO_SUBGRAPH_TO_XLA_CALL_MODULE_OP_1:.*]]) <{Sout = [#tf_type.shape<1x3>], {{.*}}, module = "", platforms = ["CPU"], version = 9 : i64}> {_entry_function = @composite_dot_general_with_relu_fn_1, _original_entry_function = "composite_dot_general_with_relu_fn_1", _tfl_quant_trait = "fully_quantizable"} - // CHECK: %[[CUSTOM_AGGREGATOR_3:.*]] = "tf.CustomAggregator"(%[[XLA_CALL_MODULE_1:.*]]) + // CHECK: %[[CUSTOM_AGGREGATOR_3:.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%[[XLA_CALL_MODULE_1:.*]]) // CHECK: return %[[CUSTOM_AGGREGATOR_3]] : tensor<1x64xf32> // CHECK: 
} @@ -111,16 +111,16 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, p // CHECK: @serving_default func.func @serving_default(%arg0: tensor<1x1024xf32> {tf_saved_model.index_path = ["input_tensor"]}) -> (tensor<1x3xf32> {tf_saved_model.index_path = ["output"]}) attributes {tf.entry_function = {control_outputs = "", inputs = "serving_default_input_tensor:0", outputs = "PartitionedCall:0"}, tf_saved_model.exported_names = ["serving_default"]} { %0 = stablehlo.constant dense<1.000000e+03> : tensor<1024x3xf32> - %1 = "tf.CustomAggregator"(%arg0) {calibration_method = 1 : i32, id = "0", initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<1x1024xf32>) -> tensor<1x1024xf32> - %2 = "tf.XlaCallModule"(%1, %0) {Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _tfl_quant_trait = "fully_quantizable", dim_args_spec = [], disabled_checks = [], function_list = [], has_token_input_output = false, module = "", platforms = ["CPU"], version = 9 : i64} : (tensor<1x1024xf32>, tensor<1024x3xf32>) -> tensor<1x3xf32> - %3 = "tf.CustomAggregator"(%2) {calibration_method = 1 : i32, id = "1", initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<1x3xf32>) -> tensor<1x3xf32> - return %3 : tensor<1x3xf32> + %1:4 = "tf.CustomAggregator"(%arg0) {calibration_method = 1 : i32, id = "0", initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<1x1024xf32>) -> (tensor<1x1024xf32>, tensor, tensor, tensor<*xi64>) + %2 = "tf.XlaCallModule"(%1#0, %0) {Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _tfl_quant_trait = "fully_quantizable", dim_args_spec = [], disabled_checks = [], function_list = [], has_token_input_output = false, 
module = "", platforms = ["CPU"], version = 9 : i64} : (tensor<1x1024xf32>, tensor<1024x3xf32>) -> tensor<1x3xf32> + %3:4 = "tf.CustomAggregator"(%2) {calibration_method = 1 : i32, id = "1", initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<1x3xf32>) -> (tensor<1x3xf32>, tensor, tensor, tensor<*xi64>) + return %3#0 : tensor<1x3xf32> } // CHECK: %[[STABLEHLO_SUBGRAPH_TO_XLA_CALL_MODULE_OP:.*]] = "tf.XlaCallModule"() <{Sout = [#tf_type.shape<1024x3>], {{.*}}, module = "", platforms = ["CPU", "TPU"], version = 9 : i64}> {_entry_function = @_stablehlo_main_0, _stablehlo_module_attrs = {jax.uses_shape_polymorphism = true}} - // CHECK: %[[CUSTOM_AGGREGATOR_0:.*]] = "tf.CustomAggregator"(%arg0) <{id = "0"}> {calibration_method = 1 : i32, initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<1x1024xf32>) -> tensor<1x1024xf32> + // CHECK: %[[CUSTOM_AGGREGATOR_0:.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%arg0) <{id = "0"}> {calibration_method = 1 : i32, initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} // CHECK: %[[XLA_CALL_MODULE:.*]] = "tf.XlaCallModule"(%[[CUSTOM_AGGREGATOR:.*]], %[[STABLEHLO_SUBGRAPH_TO_XLA_CALL_MODULE_OP:.*]]) <{Sout = [#tf_type.shape<1x3>], {{.*}}}> {_entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1" - // CHECK: %[[CUSTOM_AGGREGATOR_1:.*]] = "tf.CustomAggregator"(%[[XLA_CALL_MODULE:.*]]) + // CHECK: %[[CUSTOM_AGGREGATOR_1:.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%[[XLA_CALL_MODULE:.*]]) // CHECK: return %[[CUSTOM_AGGREGATOR_1]] // CHECK: } @@ -143,16 +143,16 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, p // CHECK-LABEL: @random_name func.func @random_name(%arg0: tensor<1x1024xf32> {tf_saved_model.index_path = ["input_tensor"]}) -> (tensor<1x3xf32> {tf_saved_model.index_path 
= ["output"]}) attributes {tf.entry_function = {control_outputs = "", inputs = "serving_default_input_tensor:0", outputs = "PartitionedCall:0"}, tf_saved_model.exported_names = ["serving_default"]} { %0 = stablehlo.constant dense<1.000000e+03> : tensor<1024x3xf32> - %1 = "tf.CustomAggregator"(%arg0) {calibration_method = 1 : i32, id = "0", initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<1x1024xf32>) -> tensor<1x1024xf32> - %2 = "tf.XlaCallModule"(%1, %0) {Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _tfl_quant_trait = "fully_quantizable", dim_args_spec = [], disabled_checks = [], function_list = [], has_token_input_output = false, module = "", platforms = ["CPU"], version = 9 : i64} : (tensor<1x1024xf32>, tensor<1024x3xf32>) -> tensor<1x3xf32> - %3 = "tf.CustomAggregator"(%2) {calibration_method = 1 : i32, id = "1", initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<1x3xf32>) -> tensor<1x3xf32> - return %3 : tensor<1x3xf32> + %1:4 = "tf.CustomAggregator"(%arg0) {calibration_method = 1 : i32, id = "0", initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<1x1024xf32>) -> (tensor<1x1024xf32>, tensor, tensor, tensor<*xi64>) + %2 = "tf.XlaCallModule"(%1#0, %0) {Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _tfl_quant_trait = "fully_quantizable", dim_args_spec = [], disabled_checks = [], function_list = [], has_token_input_output = false, module = "", platforms = ["CPU"], version = 9 : i64} : (tensor<1x1024xf32>, tensor<1024x3xf32>) -> tensor<1x3xf32> + %3:4 = "tf.CustomAggregator"(%2) {calibration_method = 1 : i32, id = "1", initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 
0.000000e+00 : f32} : (tensor<1x3xf32>) -> (tensor<1x3xf32>, tensor, tensor, tensor<*xi64>) + return %3#0 : tensor<1x3xf32> } // CHECK: %[[CONSTANT:.*]] = stablehlo.constant dense<1.000000e+03> : tensor<1024x3xf32> - // CHECK: %[[CUSTOM_AGGREGATOR_0:.*]] = "tf.CustomAggregator"(%arg0) <{id = "0"}> {calibration_method = 1 : i32, initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<1x1024xf32>) -> tensor<1x1024xf32> + // CHECK: %[[CUSTOM_AGGREGATOR_0:.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%arg0) <{id = "0"}> {calibration_method = 1 : i32, initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} // CHECK: %[[XLA_CALL_MODULE:.*]] = "tf.XlaCallModule"(%[[CUSTOM_AGGREGATOR:.*]], %[[XLA_CALL_MODULE_EXTRACTED_FROM_SUBGRAPH:.*]]) <{Sout = [#tf_type.shape<1x3>], {{.*}}, module = "", platforms = ["CPU"], version = 9 : i64}> {_entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1" - // CHECK: %[[CUSTOM_AGGREGATOR_1:.*]] = "tf.CustomAggregator"(%[[XLA_CALL_MODULE:.*]]) + // CHECK: %[[CUSTOM_AGGREGATOR_1:.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%[[XLA_CALL_MODULE:.*]]) // CHECK: return %[[CUSTOM_AGGREGATOR_1]] // CHECK: } @@ -185,19 +185,19 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, p // CHECK: @serving_default func.func @serving_default(%arg0: tensor<1024x1024xf32> {tf_saved_model.index_path = ["input_tensor"]}) -> (tensor<1024x3xf32> {tf_saved_model.index_path = ["output1"]}, tensor<1024x3xf32> {tf_saved_model.index_path = ["output2"]}) attributes {tf.entry_function = {control_outputs = "", inputs = "serving_default_input_tensor:0", outputs = "PartitionedCall:0"}, tf_saved_model.exported_names = ["serving_default"]} { %0 = stablehlo.constant dense<1.000000e+03> : tensor<1024x3xf32> - %1 = "tf.CustomAggregator"(%arg0) {calibration_method = 1 : i32, id = "0", 
initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<1024x1024xf32>) -> tensor<1024x1024xf32> - %2 = "tf.XlaCallModule"(%1, %0) {Sout = [#tf_type.shape<1024x3>], _entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _tfl_quant_trait = "fully_quantizable", dim_args_spec = [], disabled_checks = [], function_list = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<1024x1024xf32>, tensor<1024x3xf32>) -> tensor<1024x3xf32> - %3 = "tf.CustomAggregator"(%2) {calibration_method = 1 : i32, id = "1", initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<1024x3xf32>) -> tensor<1024x3xf32> + %1:4 = "tf.CustomAggregator"(%arg0) {calibration_method = 1 : i32, id = "0", initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<1024x1024xf32>) -> (tensor<1024x1024xf32>, tensor, tensor, tensor<*xi64>) + %2 = "tf.XlaCallModule"(%1#0, %0) {Sout = [#tf_type.shape<1024x3>], _entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _tfl_quant_trait = "fully_quantizable", dim_args_spec = [], disabled_checks = [], function_list = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<1024x1024xf32>, tensor<1024x3xf32>) -> tensor<1024x3xf32> + %3:4 = "tf.CustomAggregator"(%2) {calibration_method = 1 : i32, id = "1", initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<1024x3xf32>) -> (tensor<1024x3xf32>, tensor, tensor, tensor<*xi64>) %4 = stablehlo.constant dense<1.000000e+03> : tensor<1024x3xf32> - %5 = stablehlo.add %3, %4 : tensor<1024x3xf32> - %6 = stablehlo.multiply %3, %0 : tensor<1024x3xf32> + %5 = stablehlo.add %3#0, %4 : tensor<1024x3xf32> + %6 = stablehlo.multiply %3#0, %0 : 
tensor<1024x3xf32> return %5, %6 : tensor<1024x3xf32>, tensor<1024x3xf32> } // CHECK: %[[SUBGRAPH_1:.*]] = "tf.XlaCallModule"() <{Sout = [#tf_type.shape<1024x3>], {{.*}} ["CPU", "TPU"], {{.*}}}> {_entry_function = @_stablehlo_main_1 - // CHECK: %[[CUSTOM_AGGREGATOR_1:.*]] = "tf.CustomAggregator"(%arg0) <{id = "0"}> {calibration_method = 1 : i32, initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<1024x1024xf32>) -> tensor<1024x1024xf32> + // CHECK: %[[CUSTOM_AGGREGATOR_1:.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%arg0) <{id = "0"}> {calibration_method = 1 : i32, initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} // CHECK: %[[XLA_CALL_MODULE:.*]] = "tf.XlaCallModule"(%[[CUSTOM_AGGREGATOR_1]], %[[SUBGRAPH_1]]) <{Sout = [#tf_type.shape<1024x3>], {{.*}}}> {_entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1" - // CHECK: %[[CUSTOM_AGGREGATOR_2:.*]] = "tf.CustomAggregator"(%[[XLA_CALL_MODULE:.*]]) + // CHECK: %[[CUSTOM_AGGREGATOR_2:.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%[[XLA_CALL_MODULE:.*]]) // CHECK: %[[SUBGRAPH_2:.*]]:2 = "tf.XlaCallModule"(%[[CUSTOM_AGGREGATOR_2]], %[[SUBGRAPH_1]]) <{Sout = [#tf_type.shape<1024x3>, #tf_type.shape<1024x3>], {{.*}}}> {_entry_function = @_stablehlo_main_0 // CHECK: return %[[SUBGRAPH_2]]#0, %[[SUBGRAPH_2]]#1 // CHECK: } @@ -235,18 +235,18 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, p %0 = stablehlo.constant dense<1.000000e+03> : tensor<3x11xf32> // %1 is large enough that it won't be duplicated. 
%1 = stablehlo.constant dense<1.000000e+01> : tensor<3x11xf32> - %2 = "tf.CustomAggregator"(%arg0) {calibration_method = 1 : i32, id = "0", initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<3x3xf32>) -> tensor<3x3xf32> - %3 = "tf.XlaCallModule"(%2, %0) {Sout = [#tf_type.shape<3x11>], _entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _tfl_quant_trait = "fully_quantizable", dim_args_spec = [], disabled_checks = [], function_list = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<3x3xf32>, tensor<3x11xf32>) -> tensor<3x11xf32> - %4 = "tf.CustomAggregator"(%3) {calibration_method = 1 : i32, id = "1", initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<3x11xf32>) -> tensor<3x11xf32> - %5 = stablehlo.add %4, %1 : tensor<3x11xf32> + %2:4 = "tf.CustomAggregator"(%arg0) {calibration_method = 1 : i32, id = "0", initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<3x3xf32>) -> (tensor<3x3xf32>, tensor, tensor, tensor<*xi64>) + %3 = "tf.XlaCallModule"(%2#0, %0) {Sout = [#tf_type.shape<3x11>], _entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _tfl_quant_trait = "fully_quantizable", dim_args_spec = [], disabled_checks = [], function_list = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<3x3xf32>, tensor<3x11xf32>) -> tensor<3x11xf32> + %4:4 = "tf.CustomAggregator"(%3) {calibration_method = 1 : i32, id = "1", initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<3x11xf32>) -> (tensor<3x11xf32>, tensor, tensor, tensor<*xi64>) + %5 = stablehlo.add %4#0, %1 : tensor<3x11xf32> %6 = stablehlo.multiply %5, %1 : tensor<3x11xf32> return %6 : tensor<3x11xf32> 
} // CHECK: %[[SUBGRAPH_1:.*]] = "tf.XlaCallModule"() <{Sout = [#tf_type.shape<3x11>], {{.*}} ["CPU", "TPU"], {{.*}}}> {_entry_function = @_stablehlo_main_1 - // CHECK: %[[CUSTOM_AGGREGATOR_1:.*]] = "tf.CustomAggregator"(%arg0) <{id = "0"}> {calibration_method = 1 : i32, initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<3x3xf32>) -> tensor<3x3xf32> + // CHECK: %[[CUSTOM_AGGREGATOR_1:.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%arg0) <{id = "0"}> {calibration_method = 1 : i32, initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} // CHECK: %[[XLA_CALL_MODULE:.*]] = "tf.XlaCallModule"(%[[CUSTOM_AGGREGATOR_1]], %[[SUBGRAPH_1]]) <{Sout = [#tf_type.shape<3x11>], {{.*}}}> {_entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1" - // CHECK: %[[CUSTOM_AGGREGATOR_2:.*]] = "tf.CustomAggregator"(%[[XLA_CALL_MODULE:.*]]) + // CHECK: %[[CUSTOM_AGGREGATOR_2:.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%[[XLA_CALL_MODULE:.*]]) // CHECK: %[[SUBGRAPH_2:.*]] = "tf.XlaCallModule"(%[[CUSTOM_AGGREGATOR_2]]) <{Sout = [#tf_type.shape<3x11>], {{.*}} ["CPU", "TPU"], {{.*}}}> {_entry_function = @_stablehlo_main_0 // CHECK: return %[[SUBGRAPH_2]] // CHECK: } @@ -293,16 +293,16 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, p %4 = stablehlo.compare EQ, %3, %2, NOTYPE : (tensor<1024x3xf32>, tensor<1024x3xf32>) -> tensor<1024x3xi1> stablehlo.custom_call @shape_assertion(%4) {error_message = "Shape assertion failed", has_side_effect = true} : (tensor<1024x3xi1>) -> () %5 = stablehlo.constant dense<2.000000e+03> : tensor<1024x3xf32> - %6 = "tf.CustomAggregator"(%arg0) {calibration_method = 1 : i32, id = "0", initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<1024x1024xf32>) -> tensor<1024x1024xf32> - %7 = "tf.XlaCallModule"(%6, %5) 
{Sout = [#tf_type.shape<1024x3>], _entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _tfl_quant_trait = "fully_quantizable", dim_args_spec = [], disabled_checks = [], function_list = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<1024x1024xf32>, tensor<1024x3xf32>) -> tensor<1024x3xf32> - %8 = "tf.CustomAggregator"(%7) {calibration_method = 1 : i32, id = "1", initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<1024x3xf32>) -> tensor<1024x3xf32> - %9 = stablehlo.add %8, %0 : tensor<1024x3xf32> + %6:4 = "tf.CustomAggregator"(%arg0) {calibration_method = 1 : i32, id = "0", initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<1024x1024xf32>) -> (tensor<1024x1024xf32>, tensor, tensor, tensor<*xi64>) + %7 = "tf.XlaCallModule"(%6#0, %5) {Sout = [#tf_type.shape<1024x3>], _entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _tfl_quant_trait = "fully_quantizable", dim_args_spec = [], disabled_checks = [], function_list = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<1024x1024xf32>, tensor<1024x3xf32>) -> tensor<1024x3xf32> + %8:4 = "tf.CustomAggregator"(%7) {calibration_method = 1 : i32, id = "1", initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<1024x3xf32>) -> (tensor<1024x3xf32>, tensor, tensor, tensor<*xi64>) + %9 = stablehlo.add %8#0, %0 : tensor<1024x3xf32> return %9 : tensor<1024x3xf32> } // CHECK: %[[SUBGRAPH_0:.*]] = "tf.XlaCallModule"() <{Sout = [#tf_type.shape<1024x3>], {{.*}} ["CPU", "TPU"], {{.*}}}> {_entry_function = @_stablehlo_main_1 - // CHECK: %[[CUSTOM_AGGREGATOR_0:.*]] = "tf.CustomAggregator"(%arg0) <{id = "0"}> {calibration_method = 1 : i32, initial_num_bins = 0 : i32, 
max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<1024x1024xf32>) -> tensor<1024x1024xf32> + // CHECK: %[[CUSTOM_AGGREGATOR_0:.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%arg0) <{id = "0"}> {calibration_method = 1 : i32, initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} // CHECK: %[[XLA_CALL_MODULE:.*]] = "tf.XlaCallModule"(%[[CUSTOM_AGGREGATOR_0]], %[[SUBGRAPH_0]]) <{Sout = [#tf_type.shape<1024x3>], {{.*}}}> {_entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1" - // CHECK: %[[CUSTOM_AGGREGATOR_1:.*]] = "tf.CustomAggregator"(%[[XLA_CALL_MODULE:.*]]) + // CHECK: %[[CUSTOM_AGGREGATOR_1:.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%[[XLA_CALL_MODULE:.*]]) // CHECK: %[[SUBGRAPH_1:.*]] = "tf.XlaCallModule"(%[[CUSTOM_AGGREGATOR_1]]) <{Sout = [#tf_type.shape<1024x3>], {{.*}}}> {_entry_function = @_stablehlo_main_0 // CHECK: return %[[SUBGRAPH_1]] : tensor<1024x3xf32> // CHECK: } @@ -339,18 +339,18 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, p %2 = stablehlo.remainder %0, %1 : tensor<1024x3xf32> %3 = "tf.Identity"(%2) {device = ""} : (tensor<1024x3xf32>) -> tensor<1024x3xf32> %4 = stablehlo.constant dense<2.000000e+03> : tensor<1024x3xf32> - %5 = "tf.CustomAggregator"(%arg0) {calibration_method = 1 : i32, id = "0", initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<1024x1024xf32>) -> tensor<1024x1024xf32> - %6 = "tf.XlaCallModule"(%5, %4) {Sout = [#tf_type.shape<1024x3>], _entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _tfl_quant_trait = "fully_quantizable", dim_args_spec = [], disabled_checks = [], function_list = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<1024x1024xf32>, tensor<1024x3xf32>) -> tensor<1024x3xf32> - %7 = 
"tf.CustomAggregator"(%6) {calibration_method = 1 : i32, id = "1", initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<1024x3xf32>) -> tensor<1024x3xf32> - %8 = stablehlo.add %7, %0 : tensor<1024x3xf32> + %5:4 = "tf.CustomAggregator"(%arg0) {calibration_method = 1 : i32, id = "0", initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<1024x1024xf32>) -> (tensor<1024x1024xf32>, tensor, tensor, tensor<*xi64>) + %6 = "tf.XlaCallModule"(%5#0, %4) {Sout = [#tf_type.shape<1024x3>], _entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _tfl_quant_trait = "fully_quantizable", dim_args_spec = [], disabled_checks = [], function_list = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<1024x1024xf32>, tensor<1024x3xf32>) -> tensor<1024x3xf32> + %7:4 = "tf.CustomAggregator"(%6) {calibration_method = 1 : i32, id = "1", initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<1024x3xf32>) -> (tensor<1024x3xf32>, tensor, tensor, tensor<*xi64>) + %8 = stablehlo.add %7#0, %0 : tensor<1024x3xf32> return %8 : tensor<1024x3xf32> } // CHECK: %[[SUBGRAPH_0:.*]]:2 = "tf.XlaCallModule"() <{Sout = [#tf_type.shape<1024x3>, #tf_type.shape<1024x3>], {{.*}} ["CPU", "TPU"], {{.*}}}> {_entry_function = @_stablehlo_main_2 // CHECK: %[[IDENTIFY:.*]] = "tf.Identity"(%[[SUBGRAPH_0]]#1) {device = ""} : (tensor<1024x3xf32>) -> tensor<1024x3xf32> // CHECK: %[[SUBGRAPH_1:.*]] = "tf.XlaCallModule"() <{Sout = [#tf_type.shape<1024x3>], {{.*}} ["CPU", "TPU"], {{.*}}}> {_entry_function = @_stablehlo_main_1 - // CHECK: %[[CUSTOM_AGGREGATOR_0:.*]] = "tf.CustomAggregator"(%arg0) <{id = "0"}> {calibration_method = 1 : i32, initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<1024x1024xf32>) -> 
tensor<1024x1024xf32> + // CHECK: %[[CUSTOM_AGGREGATOR_0:.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%arg0) <{id = "0"}> {calibration_method = 1 : i32, initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} // CHECK: %[[XLA_CALL_MODULE:.*]] = "tf.XlaCallModule"(%[[CUSTOM_AGGREGATOR_0]], %[[SUBGRAPH_1]]) <{Sout = [#tf_type.shape<1024x3>], {{.*}}}> {_entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1" - // CHECK: %[[CUSTOM_AGGREGATOR_1:.*]] = "tf.CustomAggregator"(%[[XLA_CALL_MODULE:.*]]) + // CHECK: %[[CUSTOM_AGGREGATOR_1:.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%[[XLA_CALL_MODULE:.*]]) // CHECK: %[[SUBGRAPH_2:.*]] = "tf.XlaCallModule"(%[[CUSTOM_AGGREGATOR_1]], %[[SUBGRAPH_0]]#0) <{Sout = [#tf_type.shape<1024x3>], {{.*}}}> {_entry_function = @_stablehlo_main_0 // CHECK: return %[[SUBGRAPH_2]] : tensor<1024x3xf32> // CHECK: } @@ -394,16 +394,16 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, p %7 = stablehlo.compare EQ, %6, %2, NOTYPE : (tensor<1024x3xf32>, tensor<1024x3xf32>) -> tensor<1024x3xi1> stablehlo.custom_call @shape_assertion(%7) {error_message = "Shape assertion failed", has_side_effect = true} : (tensor<1024x3xi1>) -> () %8 = stablehlo.constant dense<2.000000e+03> : tensor<1024x3xf32> - %9 = "tf.CustomAggregator"(%arg0) {calibration_method = 1 : i32, id = "0", initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<1024x1024xf32>) -> tensor<1024x1024xf32> - %10 = "tf.XlaCallModule"(%9, %8) {Sout = [#tf_type.shape<1024x3>], _entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _tfl_quant_trait = "fully_quantizable", dim_args_spec = [], disabled_checks = [], function_list = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<1024x1024xf32>, tensor<1024x3xf32>) -> 
tensor<1024x3xf32> - %11 = "tf.CustomAggregator"(%10) {calibration_method = 1 : i32, id = "1", initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<1024x3xf32>) -> tensor<1024x3xf32> - %12 = stablehlo.add %11, %0 : tensor<1024x3xf32> + %9:4 = "tf.CustomAggregator"(%arg0) {calibration_method = 1 : i32, id = "0", initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<1024x1024xf32>) -> (tensor<1024x1024xf32>, tensor, tensor, tensor<*xi64>) + %10 = "tf.XlaCallModule"(%9#0, %8) {Sout = [#tf_type.shape<1024x3>], _entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _tfl_quant_trait = "fully_quantizable", dim_args_spec = [], disabled_checks = [], function_list = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<1024x1024xf32>, tensor<1024x3xf32>) -> tensor<1024x3xf32> + %11:4 = "tf.CustomAggregator"(%10) {calibration_method = 1 : i32, id = "1", initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<1024x3xf32>) -> (tensor<1024x3xf32>, tensor, tensor, tensor<*xi64>) + %12 = stablehlo.add %11#0, %0 : tensor<1024x3xf32> return %12 : tensor<1024x3xf32> } // CHECK: %[[SUBGRAPH_0:.*]]:2 = "tf.XlaCallModule"() <{Sout = [#tf_type.shape<1024x3>, #tf_type.shape<1024x3>], {{.*}} ["CPU", "TPU"], {{.*}}}> {_entry_function = @_stablehlo_main_1 - // CHECK: %[[CUSTOM_AGGREGATOR_0:.*]] = "tf.CustomAggregator"(%arg0) <{id = "0"}> {calibration_method = 1 : i32, initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<1024x1024xf32>) -> tensor<1024x1024xf32> + // CHECK: %[[CUSTOM_AGGREGATOR_0:.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%arg0) <{id = "0"}> {calibration_method = 1 : i32, initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 
0.000000e+00 : f32} // CHECK: %[[XLA_CALL_MODULE:.*]] = "tf.XlaCallModule"(%[[CUSTOM_AGGREGATOR_0]], %[[SUBGRAPH_0]]#1) <{Sout = [#tf_type.shape<1024x3>], {{.*}}}> {_entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1" - // CHECK: %[[CUSTOM_AGGREGATOR_1:.*]] = "tf.CustomAggregator"(%[[XLA_CALL_MODULE:.*]]) + // CHECK: %[[CUSTOM_AGGREGATOR_1:.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%[[XLA_CALL_MODULE:.*]]) // CHECK: %[[SUBGRAPH_1:.*]] = "tf.XlaCallModule"(%[[CUSTOM_AGGREGATOR_1]], %[[SUBGRAPH_0]]#0) <{Sout = [#tf_type.shape<1024x3>], {{.*}}}> {_entry_function = @_stablehlo_main_0 // CHECK: return %[[SUBGRAPH_1]] : tensor<1024x3xf32> // CHECK: } @@ -411,11 +411,16 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, p // ----- -// main function contains StatefulPartitionedCall ops which is used to preserve -// aliased functions. This test make sure stablehlo ops in each PartitionedCall -// functions are lifted. +// main function contains PartitionedCall and StatefulPartitionedCall ops which +// is used to preserve aliased functions. This test make sure stablehlo ops in +// each PartitionedCall functions are lifted. 
module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, producer = 1629 : i32}, tf_saved_model.semantics} { + // CHECK: func private @_stablehlo_main_2 + // CHECK: stablehlo.multiply %arg1, %arg2 : tensor<3x3xf32> + // CHECK: return + // CHECK: } + // CHECK: func private @_stablehlo_main_1 // CHECK: stablehlo.add %arg1, %arg2 : tensor<3x3xf32> // CHECK: return @@ -435,13 +440,20 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, p }> { _collective_manager_ids = [], device = "" } : (tensor<3x3xf32>, tensor<3x3xf32>) -> tensor<3x3xf32> - return %2 : tensor<3x3xf32> + %3 = "tf.PartitionedCall"(%2, %1) <{ + config = "", config_proto = "", executor_type = "", f = @some_other_func + }> { + _collective_manager_ids = [], device = "" + } : (tensor<3x3xf32>, tensor<3x3xf32>) -> tensor<3x3xf32> + return %3 : tensor<3x3xf32> } // CHECK: func.func @main - // CHECK: %[[INPUT:.*]]:2 = "tf.XlaCallModule"() + // CHECK: %[[INPUT:.*]]:3 = "tf.XlaCallModule"() // CHECK-SAME: _entry_function = @_stablehlo_main_0 - // CHECK: "tf.StatefulPartitionedCall"(%[[INPUT]]#0, %[[INPUT]]#1) + // CHECK: %[[ADD:.*]] = "tf.StatefulPartitionedCall"(%[[INPUT]]#1, %[[INPUT]]#2) // CHECK-SAME: f = @some_func + // CHECK: "tf.PartitionedCall"(%[[ADD]], %[[INPUT]]#0) + // CHECK-SAME: f = @some_other_func // CHECK: return func.func private @some_func(%arg0: tensor<3x3xf32>, %arg1: tensor<3x3xf32>) -> tensor<3x3xf32> attributes {tf._noinline = true} { @@ -452,4 +464,13 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, p // CHECK: tf.XlaCallModule // CHECK-SAME: _entry_function = @_stablehlo_main_1 // CHECK: return + + func.func private @some_other_func(%arg0: tensor<3x3xf32>, %arg1: tensor<3x3xf32>) -> tensor<3x3xf32> attributes {tf._noinline = true} { + %0 = stablehlo.multiply %arg0, %arg1 : tensor<3x3xf32> + return %0 : tensor<3x3xf32> + } + // CHECK: func.func private @some_other_func + // CHECK: tf.XlaCallModule + // 
CHECK-SAME: _entry_function = @_stablehlo_main_2 + // CHECK: return } diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/unfuse_mhlo_batch_norm.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/unfuse_mhlo_batch_norm.mlir index 7a57bdd64b2aab..0873b68b475b5a 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/unfuse_mhlo_batch_norm.mlir +++ b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/unfuse_mhlo_batch_norm.mlir @@ -13,10 +13,10 @@ func.func @unfuse_batch_norm( // CHECK-DAG: %[[EPS_BCAST:.+]] = mhlo.constant dense<1.001000e-05> : tensor<256xf32> // CHECK-DAG: %[[VARIANCE_EPS:.+]] = mhlo.add %[[VARIANCE]], %[[EPS_BCAST]] : tensor<256xf32> // CHECK-DAG: %[[STDDEV:.+]] = mhlo.sqrt %[[VARIANCE_EPS]] : tensor<256xf32> - // CHECK-DAG: %[[STDDEV_BCAST:.+]] = "mhlo.broadcast_in_dim"(%[[STDDEV]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<256xf32>) -> tensor<4x256xf32> - // CHECK-DAG: %[[SCALE_BCAST:.+]] = "mhlo.broadcast_in_dim"(%[[SCALE]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<256xf32>) -> tensor<4x256xf32> - // CHECK-DAG: %[[OFFSET_BCAST:.+]] = "mhlo.broadcast_in_dim"(%[[OFFSET]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<256xf32>) -> tensor<4x256xf32> - // CHECK-DAG: %[[MEAN_BCAST:.+]] = "mhlo.broadcast_in_dim"(%[[MEAN]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<256xf32>) -> tensor<4x256xf32> + // CHECK-DAG: %[[STDDEV_BCAST:.+]] = "mhlo.broadcast_in_dim"(%[[STDDEV]]) <{broadcast_dimensions = dense<1> : tensor<1xi64>}> : (tensor<256xf32>) -> tensor<4x256xf32> + // CHECK-DAG: %[[SCALE_BCAST:.+]] = "mhlo.broadcast_in_dim"(%[[SCALE]]) <{broadcast_dimensions = dense<1> : tensor<1xi64>}> : (tensor<256xf32>) -> tensor<4x256xf32> + // CHECK-DAG: %[[OFFSET_BCAST:.+]] = "mhlo.broadcast_in_dim"(%[[OFFSET]]) <{broadcast_dimensions = dense<1> : tensor<1xi64>}> : (tensor<256xf32>) -> tensor<4x256xf32> + // CHECK-DAG: 
%[[MEAN_BCAST:.+]] = "mhlo.broadcast_in_dim"(%[[MEAN]]) <{broadcast_dimensions = dense<1> : tensor<1xi64>}> : (tensor<256xf32>) -> tensor<4x256xf32> // CHECK: %[[X_CENTER:.+]] = mhlo.subtract %[[X]], %[[MEAN_BCAST]] : tensor<4x256xf32> // CHECK: %[[X_SCALED:.+]] = mhlo.multiply %[[X_CENTER]], %[[SCALE_BCAST]] : tensor<4x256xf32> // CHECK: %[[X_NORMED:.+]] = mhlo.divide %[[X_SCALED]], %[[STDDEV_BCAST]] : tensor<4x256xf32> diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/xla_call_module_to_call.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/xla_call_module_to_call.mlir new file mode 100644 index 00000000000000..f0330d0266d56d --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/xla_call_module_to_call.mlir @@ -0,0 +1,23 @@ +// RUN: stablehlo-quant-opt %s -split-input-file -stablehlo-xla-call-module-to-call | FileCheck %s + +// ----- + +// Tests composite tf.XlaCallModule is converted to func.call. + +module { + // CHECK-LABEL: func.func @main + func.func @main(%arg0: tensor<1x1024xf32>) -> tensor<1x3xf32> { + // CHECK: call @composite_dot_general_fn_1 + // CHECK-SAME: (tensor<1x1024xf32>, tensor<1024x3xf32>) -> tensor<1x3xf32> + // CHECK-NOT: tf.XlaCallModule + %0 = "tf.Const"() <{value = dense<0.5> : tensor<1024x3xf32>}> : () -> tensor<1024x3xf32> + %2 = "tf.XlaCallModule"(%arg0, %0) <{Sout = [#tf_type.shape<1x3>], dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64}> {_entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = ""} : (tensor<1x1024xf32>, tensor<1024x3xf32>) -> tensor<1x3xf32> + return %2 : tensor<1x3xf32> + } + // CHECK-LABEL: func.func private @composite_dot_general_fn_1 + // CHECK-SAME: -> tensor<1x3xf32> + func.func private @composite_dot_general_fn_1(%arg0: tensor<1x1024xf32>, %arg1: 
tensor<1024x3xf32>) -> tensor<1x3xf32> attributes {_from_xla_call_module} { + %0 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [1] x [0] : (tensor<1x1024xf32>, tensor<1024x3xf32>) -> tensor<1x3xf32> + return %0 : tensor<1x3xf32> + } +} diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/BUILD b/tensorflow/compiler/mlir/quantization/tensorflow/BUILD index be0792ab76aff3..94dc1b1569620f 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/BUILD +++ b/tensorflow/compiler/mlir/quantization/tensorflow/BUILD @@ -411,6 +411,7 @@ cc_library( "//tensorflow/compiler/mlir/quantization/common/quantization_lib", "//tensorflow/compiler/mlir/quantization/common/quantization_lib:quantization_config", "//tensorflow/compiler/mlir/quantization/stablehlo:quantization_config_proto_cc", + "//tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration:calibration_parameters", "//tensorflow/compiler/mlir/quantization/tensorflow/cc:const_op_size", "//tensorflow/compiler/mlir/quantization/tensorflow/cc:constant_fold", "//tensorflow/compiler/mlir/quantization/tensorflow/cc:quantization_unit_loc", @@ -440,8 +441,6 @@ cc_library( "//tensorflow/core/platform:macros", "//tensorflow/core/platform:path", "//tensorflow/core/tpu:tpu_defs", - "//tensorflow/lite/kernels:padding", - "//tensorflow/lite/kernels/internal:quantization_util", "@com_google_absl//absl/algorithm:container", "@com_google_absl//absl/cleanup", "@com_google_absl//absl/container:flat_hash_map", diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/BUILD b/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/BUILD index fe081684e55736..9ae8d6401afcd6 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/BUILD +++ b/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/BUILD @@ -79,7 +79,6 @@ cc_library( compatible_with = get_compatible_with_portable(), deps = [ ":calibration_statistics_proto_cc", - "//tensorflow/core:framework", 
"@com_google_absl//absl/types:span", ], ) @@ -94,6 +93,7 @@ cc_library( ":calibration_statistics_proto_cc", "//tensorflow/compiler/mlir/quantization/stablehlo:quantization_config_proto_cc", "//tensorflow/compiler/mlir/quantization/tensorflow:quantization_options_proto_cc", + "@com_google_absl//absl/types:span", ], ) @@ -107,6 +107,7 @@ cc_library( ":calibration_statistics_proto_cc", "//tensorflow/compiler/mlir/quantization/stablehlo:quantization_config_proto_cc", "//tensorflow/compiler/mlir/quantization/tensorflow:quantization_options_proto_cc", + "@com_google_absl//absl/types:span", ], ) @@ -119,7 +120,9 @@ cc_library( ":calibration_statistics_collector_base", ":calibration_statistics_proto_cc", "//tensorflow/compiler/mlir/quantization/stablehlo:quantization_config_proto_cc", + "//tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration:calibration_parameters", "//tensorflow/compiler/mlir/quantization/tensorflow:quantization_options_proto_cc", + "@com_google_absl//absl/types:span", ], ) @@ -169,7 +172,7 @@ tf_cc_test( srcs = ["calibration_statistics_collector_test.cc"], deps = [ ":calibration_statistics_collector_average_min_max", - ":calibration_statistics_collector_base", + ":calibration_statistics_collector_histogram", ":calibration_statistics_collector_min_max", ":calibration_statistics_proto_cc", "//tensorflow/core:test", @@ -203,8 +206,12 @@ tf_kernel_library( deps = [ ":calibrator_singleton_impl", "//tensorflow/compiler/mlir/quantization/stablehlo:quantization_config_proto_cc", + "//tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration:calibration_parameters", "//tensorflow/compiler/mlir/quantization/tensorflow:quantization_options_proto_cc", "//tensorflow/core:framework", + "//tensorflow/core:protos_all_cc", + "@com_google_absl//absl/status", + "@local_tsl//tsl/platform:errors", ], ) @@ -255,3 +262,48 @@ tf_python_pybind_extension( "@pybind11_protobuf//pybind11_protobuf:native_proto_caster", ], ) + +tf_kernel_library( + name = 
"calibration_statistics_saver_op", + srcs = ["calibration_statistics_saver_op.cc"], + compatible_with = get_compatible_with_portable(), + visibility = [ + "//tensorflow:__pkg__", + "//tensorflow/compiler/mlir/quantization/tensorflow/python:__pkg__", + ], + deps = [ + ":calibration_statistics_collector_average_min_max", + ":calibration_statistics_collector_base", + ":calibration_statistics_collector_histogram", + ":calibration_statistics_collector_min_max", + "//tensorflow/compiler/mlir/quantization/stablehlo:quantization_config_proto_cc", + "//tensorflow/core:framework", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core/platform:logging", + "@com_google_absl//absl/base:nullability", + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/status", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:span", + "@local_tsl//tsl/platform:env", + ], +) + +tf_cc_test( + name = "calibration_statistics_saver_op_test", + srcs = ["calibration_statistics_saver_op_test.cc"], + deps = [ + ":calibration_statistics_proto_cc", + ":calibration_statistics_saver_op", + "//tensorflow/compiler/mlir/quantization/stablehlo:quantization_config_proto_cc", + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "//tensorflow/core:testlib", + "@com_google_googletest//:gtest", + "@local_tsl//tsl/platform:errors", + "@local_tsl//tsl/platform:status", + "@local_tsl//tsl/platform:status_matchers", + ], +) diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics.proto b/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics.proto index eca79c4a141b3c..d4bc053a77cf32 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics.proto +++ b/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics.proto @@ -37,10 +37,15 @@ message CalibrationStatistics { // 
hist_freq[i] saves frequency of range [bins[i], bins[i + 1]). // bins[i] = lower_bound + bin_width * i // bins[i + 1] = lower_bound + bin_width * (i + 1) - repeated int64 hist_freq = 3; + repeated float hist_freq = 3; } MinMaxStatistics min_max_statistics = 1; AverageMinMaxStatistics average_min_max_statistics = 2; HistogramStatistics histogram_statistics = 3; } + +message CalibrationStatisticsMap { + // A map from the id of CustomAggregator op to its collected statistics. + map statistics = 1; +} diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics_collector_average_min_max.cc b/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics_collector_average_min_max.cc index cab8abdee4b5a7..e1faa17505edf5 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics_collector_average_min_max.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics_collector_average_min_max.cc @@ -14,10 +14,10 @@ limitations under the License. 
==============================================================================*/ #include "tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics_collector_average_min_max.h" -#include -#include +#include #include +#include "absl/types/span.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics.pb.h" namespace tensorflow { @@ -30,21 +30,13 @@ void CalibrationStatisticsCollectorAverageMinMax::ClearData() { } void CalibrationStatisticsCollectorAverageMinMax::Collect( - const float *data, const unsigned int N) { - float input_min = std::numeric_limits::max(), - input_max = std::numeric_limits::lowest(); + const float min, const float max, absl::Span histogram) { + const float current_min_sum = average_min_max_statistics_.min_sum(); + const float current_max_sum = average_min_max_statistics_.max_sum(); + const int current_num_samples = average_min_max_statistics_.num_samples(); - for (int i = 0; i < N; ++i) { - input_min = std::min(input_min, data[i]); - input_max = std::max(input_max, data[i]); - } - - float current_min_sum = average_min_max_statistics_.min_sum(); - float current_max_sum = average_min_max_statistics_.max_sum(); - int current_num_samples = average_min_max_statistics_.num_samples(); - - average_min_max_statistics_.set_min_sum(current_min_sum + input_min); - average_min_max_statistics_.set_max_sum(current_max_sum + input_max); + average_min_max_statistics_.set_min_sum(current_min_sum + min); + average_min_max_statistics_.set_max_sum(current_max_sum + max); average_min_max_statistics_.set_num_samples(current_num_samples + 1); } diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics_collector_average_min_max.h b/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics_collector_average_min_max.h index 317b96ea423cb7..f6a5da84f1d675 100644 --- 
a/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics_collector_average_min_max.h +++ b/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics_collector_average_min_max.h @@ -17,6 +17,7 @@ limitations under the License. #include +#include "absl/types/span.h" #include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics.pb.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics_collector_base.h" @@ -37,7 +38,8 @@ class CalibrationStatisticsCollectorAverageMinMax void ClearData() override; - void Collect(const float *data, unsigned int N) override; + void Collect(float min, float max, + absl::Span histogram) override; std::optional GetStatistics() const override; diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics_collector_base.h b/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics_collector_base.h index 26417a1a6dae4d..9ce6a81930a7d6 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics_collector_base.h +++ b/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics_collector_base.h @@ -15,12 +15,11 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_CALIBRATOR_CALIBRATION_STATISTICS_COLLECTOR_BASE_H_ #define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_CALIBRATOR_CALIBRATION_STATISTICS_COLLECTOR_BASE_H_ +#include #include -#include #include "absl/types/span.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics.pb.h" -#include "tensorflow/core/framework/tensor.h" namespace tensorflow { namespace calibrator { @@ -30,25 +29,11 @@ namespace calibrator { // statistics based on the calibration methods. 
class CalibrationStatisticsCollectorBase { public: - // Collect data for calibration using float vector. - // It internally calls private method Collect(float*, unsigned int) - void Collect(const std::vector& data_vec) { - Collect(data_vec.data(), data_vec.size()); - } - // Collect data for calibration using absl::Span. - // It internally calls private method Collect(float*, unsigned int) - void Collect(absl::Span data_span) { - Collect(data_span.data(), data_span.size()); - } - // Collect data for calibration using Tensor - // It internally calls private method Collect(float*, unsigned int) - void Collect(const Tensor& data_tensor) { - auto data_flat = data_tensor.flat(); - Collect(data_flat.data(), data_flat.size()); - } + // Collect data for calibration. + virtual void Collect(float min, float max, + absl::Span histogram) = 0; virtual void ClearData() = 0; - virtual void Collect(const float* data, unsigned int N) = 0; // Return the statistics needed for a given calibration method. virtual std::optional GetStatistics() const = 0; virtual ~CalibrationStatisticsCollectorBase() = default; diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics_collector_histogram.cc b/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics_collector_histogram.cc index 12e9f19dff2cab..1ad5bf1aba2091 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics_collector_histogram.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics_collector_histogram.cc @@ -16,60 +16,94 @@ limitations under the License. 
#include #include +#include #include #include +#include +#include "absl/types/span.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/calibration_parameters.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics.pb.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.pb.h" namespace tensorflow { namespace calibrator { - -void CalibrationStatisticsCollectorHistogram::ClearData() { - num_bins_ = 256; - bin_width_ = 0; - hist_freq_.resize(num_bins_, 0); +namespace { + +using ::stablehlo::quantization::CalculateBinIndex; +using ::stablehlo::quantization::CalculateBinWidth; +using ::stablehlo::quantization::CalculateLowerBound; + +// Gets the histogram frequencies for the given range. +float GetRangeFrequencies(absl::Span histogram, + const float bin_width, const float lower_bound, + const float range_start, const float range_end) { + float freq_sum = 0.f; + for (float range = std::max(range_start, lower_bound); range < range_end; + range += bin_width) { + const int32_t idx = CalculateBinIndex(range, lower_bound, bin_width); + if (idx >= histogram.size()) break; + + // If the range is smaller than bin width, add the proportional value of + // that bin. + const float proportion = std::min(range_end - range, bin_width) / bin_width; + freq_sum += histogram[idx] * proportion; + } + return freq_sum; } -void CalibrationStatisticsCollectorHistogram::Collect(const float *data, - const unsigned int N) { - if (N == 0) return; +} // namespace - // When histogram is not initialized. - if (bin_width_ == 0) { - hist_freq_.resize(num_bins_, 0); - auto minmax = std::minmax_element(data, data + N); - - // The min and max of the first data will be the range of the histogram. 
- float min_value = std::floor(*minmax.first); - float max_value = std::ceil(*minmax.second); +void CalibrationStatisticsCollectorHistogram::ClearData() { + hist_freq_.clear(); +} - // The bin width is (max - min) divided by num_bins. - bin_width_ = (max_value - min_value) / num_bins_; +void CalibrationStatisticsCollectorHistogram::Collect( + const float min, const float max, absl::Span histogram) { + if (histogram.empty()) return; - // The lower bound is min value of data. - lower_bound_ = min_value; + // Reconstruct the bin width, lower and upper bound from the collected data. + const float collected_bin_width = + CalculateBinWidth(min, max, histogram.size()); + const float collected_lower_bound = + CalculateLowerBound(min, collected_bin_width); + const float collected_upper_bound = + std::ceil(max / collected_bin_width) * collected_bin_width; - // This is the worst case of first initialization, so it returns - // instantly. 1e-9 is threshold. - if (std::abs(bin_width_) < 1e-9) return; + // When histogram is not initialized. + if (hist_freq_.empty()) { + bin_width_ = collected_bin_width; + lower_bound_ = collected_lower_bound; } - for (int i = 0; i < N; ++i) { - int idx = GetHistogramIndex(data[i]); - hist_freq_[idx]++; + const auto [lower_idx, upper_idx] = + ExpandHistogramIfNeeded(collected_lower_bound, collected_upper_bound); + for (int32_t idx = lower_idx; idx <= upper_idx; ++idx) { + // Calculate the range covered by this index then add with the collected + // frequency associated to that range. 
+ const float range_start = lower_bound_ + idx * bin_width_; + hist_freq_[idx] += GetRangeFrequencies(histogram, collected_bin_width, + collected_lower_bound, range_start, + range_start + bin_width_); } } std::optional CalibrationStatisticsCollectorHistogram::GetStatistics() const { - if (bin_width_ == 0) return std::nullopt; + if (hist_freq_.empty()) return std::nullopt; CalibrationStatistics::HistogramStatistics hist_stats; + // Skip trailing zeros in the histogram. + int32_t real_size = hist_freq_.size(); + for (; real_size > 0; --real_size) { + if (hist_freq_[real_size - 1] != 0) break; + } + hist_stats.set_lower_bound(lower_bound_); hist_stats.set_bin_width(bin_width_); - hist_stats.mutable_hist_freq()->Assign(hist_freq_.begin(), hist_freq_.end()); + hist_stats.mutable_hist_freq()->Assign(hist_freq_.begin(), + hist_freq_.begin() + real_size); CalibrationStatistics statistics; statistics.mutable_histogram_statistics()->CopyFrom(hist_stats); @@ -77,28 +111,23 @@ CalibrationStatisticsCollectorHistogram::GetStatistics() const { return statistics; } -int CalibrationStatisticsCollectorHistogram::ExpandHistogramIfNeeded(int idx) { - // If idx < 0, then expand the histogram to the left. - if (idx < 0) { - hist_freq_.insert(hist_freq_.begin(), -idx, 0); - lower_bound_ -= bin_width_ * (-idx); - idx = 0; +std::pair +CalibrationStatisticsCollectorHistogram::ExpandHistogramIfNeeded( + const float lower_bound, const float upper_bound) { + int32_t lower_idx = CalculateBinIndex(lower_bound, lower_bound_, bin_width_); + // If lower_idx < 0, then expand the histogram to the left. + if (lower_idx < 0) { + hist_freq_.insert(hist_freq_.begin(), -lower_idx, 0); + lower_bound_ -= bin_width_ * (-lower_idx); + lower_idx = 0; } - // If idx >= hist_freq_.size(), then expand the histogram to the left. 
- if (idx >= hist_freq_.size()) { - hist_freq_.resize(idx + 1, 0); + int32_t upper_idx = CalculateBinIndex(upper_bound, lower_bound_, bin_width_); + // If upper_idx >= hist_freq_.size(), then expand the histogram to the right. + if (upper_idx >= hist_freq_.size()) { + hist_freq_.resize(upper_idx + 1, 0); } - - return idx; -} - -int CalibrationStatisticsCollectorHistogram::GetHistogramIndex( - const float value) { - // Calculate index of histogram - int idx = (value - lower_bound_) / bin_width_; - - return ExpandHistogramIfNeeded(idx); + return std::make_pair(lower_idx, upper_idx); } } // namespace calibrator diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics_collector_histogram.h b/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics_collector_histogram.h index cbb2c22c90863e..84f641a5ad0c92 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics_collector_histogram.h +++ b/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics_collector_histogram.h @@ -18,7 +18,9 @@ limitations under the License. #include #include #include +#include +#include "absl/types/span.h" #include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics.pb.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics_collector_base.h" @@ -27,42 +29,29 @@ limitations under the License. 
namespace tensorflow { namespace calibrator { -using ::stablehlo::quantization::CalibrationOptions; class CalibrationStatisticsCollectorHistogram : public CalibrationStatisticsCollectorBase { public: - explicit CalibrationStatisticsCollectorHistogram( - const CalibrationOptions& calib_opts) { - ClearData(); - num_bins_ = calib_opts.calibration_parameters().initial_num_bins(); - } + explicit CalibrationStatisticsCollectorHistogram() { ClearData(); } void ClearData() override; - void Collect(const float* data, unsigned int N) override; + void Collect(float min, float max, + absl::Span histogram) override; std::optional GetStatistics() const override; private: - // Returns expanded histogram's index. If idx < 0, then expand the histogram - // to the left. If idx >= hist_freq_.size(), then expand the histogram to the - // right. - int ExpandHistogramIfNeeded(int idx); - - // Calculate the histogram index of value and if index of value is exceeds the - // range of histogram, then this function extends hist_freq_ and updates - // lower_bound_. This function returns the expanded histogram's index. - int GetHistogramIndex(float value); + // Expands the histogram so the lower_bound and upper_bound can fit in the + // histogram. Returns the indexes associated to those values. + std::pair ExpandHistogramIfNeeded(float lower_bound, + float upper_bound); // hist_freq_[i] saves frequency of range [bins[i], bins[i + 1]). // bins[i] = lower_bound_ + bin_width_ * i // bins[i + 1] = lower_bound_ + bin_width_ * (i + 1) - std::deque hist_freq_; - - // The number of bins when histogram is initialized. It can be increased - // because histogram is dynamically expanded by sample inputs. 
- int num_bins_; + std::deque hist_freq_; // Width of bin float bin_width_; diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics_collector_min_max.cc b/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics_collector_min_max.cc index d549344fcc4a1f..50b7590a2db0b5 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics_collector_min_max.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics_collector_min_max.cc @@ -15,9 +15,11 @@ limitations under the License. #include "tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics_collector_min_max.h" #include +#include #include #include +#include "absl/types/span.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics.pb.h" namespace tensorflow { @@ -33,18 +35,12 @@ void CalibrationStatisticsCollectorMinMax::ClearData() { min_max_statistics_.set_global_max(std::numeric_limits::lowest()); } -void CalibrationStatisticsCollectorMinMax::Collect(const float *data, - const unsigned int N) { - float input_min = min_max_statistics_.global_min(); - float input_max = min_max_statistics_.global_max(); - - for (int i = 0; i < N; ++i) { - input_min = std::min(input_min, data[i]); - input_max = std::max(input_max, data[i]); - } - - min_max_statistics_.set_global_min(input_min); - min_max_statistics_.set_global_max(input_max); +void CalibrationStatisticsCollectorMinMax::Collect( + const float min, const float max, absl::Span histogram) { + min_max_statistics_.set_global_min( + std::min(min_max_statistics_.global_min(), min)); + min_max_statistics_.set_global_max( + std::max(min_max_statistics_.global_max(), max)); } std::optional diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics_collector_min_max.h 
b/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics_collector_min_max.h index c282bc29987755..8ee545e53f36b7 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics_collector_min_max.h +++ b/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics_collector_min_max.h @@ -15,8 +15,10 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_CALIBRATOR_CALIBRATION_STATISTICS_COLLECTOR_MIN_MAX_H_ #define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_CALIBRATOR_CALIBRATION_STATISTICS_COLLECTOR_MIN_MAX_H_ +#include #include +#include "absl/types/span.h" #include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics.pb.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics_collector_base.h" @@ -37,7 +39,8 @@ class CalibrationStatisticsCollectorMinMax void ClearData() override; - void Collect(const float *data, unsigned int N) override; + void Collect(float min, float max, + absl::Span histogram) override; std::optional GetStatistics() const override; diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics_collector_test.cc b/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics_collector_test.cc index 3a87488649019b..5e291bec868537 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics_collector_test.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics_collector_test.cc @@ -13,12 +13,12 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ #include -#include +#include #include #include "tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics.pb.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics_collector_average_min_max.h" -#include "tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics_collector_base.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics_collector_histogram.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics_collector_min_max.h" #include "tensorflow/core/platform/test.h" @@ -26,19 +26,16 @@ namespace tensorflow { namespace calibrator { namespace { +using ::testing::ElementsAre; + TEST(CalibrationStatisticsCollectorTest, SimpleMinMax) { auto collector = CalibrationStatisticsCollectorMinMax(); - std::vector> collect_vec; - - collect_vec.push_back({1.0f, 2.0f, 3.0f, 4.0f, 5.0f}); - collect_vec.push_back({1.0f, 2.0f, 3.0f, 4.0f, 10.0f}); - collect_vec.push_back({-5.0f, 2.0f, 3.0f, 4.0f, 5.0f}); + collector.Collect( + /*min=*/1.0f, /*max=*/10.f, /*histogram=*/{}); + collector.Collect( + /*min=*/-5.0f, /*max=*/5.f, /*histogram=*/{}); - for (auto data_vec : collect_vec) { - collector.CalibrationStatisticsCollectorBase::Collect( - /*data_vec=*/data_vec); - } std::optional statistics = collector.GetStatistics(); EXPECT_TRUE(statistics.has_value()); @@ -49,42 +46,26 @@ TEST(CalibrationStatisticsCollectorTest, SimpleMinMax) { TEST(CalibrationStatisticsCollectorTest, SimpleAverageMinMax) { auto collector = CalibrationStatisticsCollectorAverageMinMax(); - std::vector> collect_vec; - - collect_vec.push_back({1.0f, 2.0f, 3.0f, 4.0f, 5.0f}); // min=1.0f, max=5.0f - collect_vec.push_back( - {1.0f, 2.0f, 3.0f, 4.0f, 10.0f}); // min=1.0f, max=10.0f - collect_vec.push_back( - {-5.0f, 2.0f, 3.0f, 4.0f, 5.0f}); // min=-5.0f, max=5.0f + 
collector.Collect( + /*min=*/1.0f, /*max=*/10.f, /*histogram=*/{}); + collector.Collect( + /*min=*/-5.0f, /*max=*/5.f, /*histogram=*/{}); - for (auto data_vec : collect_vec) { - collector.CalibrationStatisticsCollectorBase::Collect( - /*data_vec=*/data_vec); - } std::optional statistics = collector.GetStatistics(); EXPECT_TRUE(statistics.has_value()); - // 1.0f + 1.0f - 5.0f - EXPECT_EQ(statistics.value().average_min_max_statistics().min_sum(), -3.0f); - // 5.0f + 10.0f + 5.0f - EXPECT_EQ(statistics.value().average_min_max_statistics().max_sum(), 20.0f); - // collect_vec.size() - EXPECT_EQ(statistics.value().average_min_max_statistics().num_samples(), 3); + EXPECT_EQ(statistics.value().average_min_max_statistics().min_sum(), -4.0f); + EXPECT_EQ(statistics.value().average_min_max_statistics().max_sum(), 15.0f); + EXPECT_EQ(statistics.value().average_min_max_statistics().num_samples(), 2); } TEST(CalibrationStatisticsCollectorTest, ClearDataAndGetResultsMinMax) { auto collector = CalibrationStatisticsCollectorMinMax(); - std::vector> collect_vec; - - collect_vec.push_back({1.0f, 2.0f, 3.0f, 4.0f, 5.0f}); - collect_vec.push_back({1.0f, 2.0f, 3.0f, 4.0f, 10.0f}); - collect_vec.push_back({-5.0f, 2.0f, 3.0f, 4.0f, 5.0f}); - - for (auto data_vec : collect_vec) { - collector.CalibrationStatisticsCollectorBase::Collect( - /*data_vec=*/data_vec); - } + collector.Collect( + /*min=*/1.0f, /*max=*/10.f, /*histogram=*/{}); + collector.Collect( + /*min=*/-5.0f, /*max=*/5.f, /*histogram=*/{}); std::optional statistics = collector.GetStatistics(); @@ -96,11 +77,10 @@ TEST(CalibrationStatisticsCollectorTest, ClearDataAndGetResultsMinMax) { statistics = collector.GetStatistics(); EXPECT_FALSE(statistics.has_value()); - collect_vec.pop_back(); // pop last element - for (auto data_vec : collect_vec) { - collector.CalibrationStatisticsCollectorBase::Collect( - /*data_vec=*/data_vec); - } + collector.Collect( + /*min=*/1.0f, /*max=*/10.f, /*histogram=*/{}); + collector.Collect( + 
/*min=*/2.0f, /*max=*/5.f, /*histogram=*/{}); statistics = collector.GetStatistics(); @@ -112,41 +92,213 @@ TEST(CalibrationStatisticsCollectorTest, ClearDataAndGetResultsMinMax) { TEST(CalibrationStatisticsCollectorTest, ClearDataAndGetResultsAverageMinMax) { auto collector = CalibrationStatisticsCollectorAverageMinMax(); - std::vector> collect_vec; - - collect_vec.push_back({1.0f, 2.0f, 3.0f, 4.0f, 5.0f}); - collect_vec.push_back({1.0f, 2.0f, 3.0f, 4.0f, 20.0f}); - collect_vec.push_back({-5.0f, 2.0f, 3.0f, 4.0f, 5.0f}); - - for (auto data_vec : collect_vec) { - collector.CalibrationStatisticsCollectorBase::Collect( - /*data_vec=*/data_vec); - } + collector.Collect( + /*min=*/1.0f, /*max=*/10.f, /*histogram=*/{}); + collector.Collect( + /*min=*/-5.0f, /*max=*/5.f, /*histogram=*/{}); std::optional statistics = collector.GetStatistics(); EXPECT_TRUE(statistics.has_value()); - EXPECT_EQ(statistics.value().average_min_max_statistics().min_sum(), -3.0f); - EXPECT_EQ(statistics.value().average_min_max_statistics().max_sum(), 30.0f); - EXPECT_EQ(statistics.value().average_min_max_statistics().num_samples(), 3); + EXPECT_EQ(statistics.value().average_min_max_statistics().min_sum(), -4.0f); + EXPECT_EQ(statistics.value().average_min_max_statistics().max_sum(), 15.0f); + EXPECT_EQ(statistics.value().average_min_max_statistics().num_samples(), 2); collector.ClearData(); statistics = collector.GetStatistics(); EXPECT_FALSE(statistics.has_value()); - collect_vec.pop_back(); // pop last element - for (auto data_vec : collect_vec) { - collector.CalibrationStatisticsCollectorBase::Collect( - /*data_vec=*/data_vec); - } + collector.Collect( + /*min=*/1.0f, /*max=*/10.f, /*histogram=*/{}); statistics = collector.GetStatistics(); EXPECT_TRUE(statistics.has_value()); - EXPECT_EQ(statistics.value().average_min_max_statistics().min_sum(), 2.0f); - EXPECT_EQ(statistics.value().average_min_max_statistics().max_sum(), 25.0f); - 
EXPECT_EQ(statistics.value().average_min_max_statistics().num_samples(), 2); + EXPECT_EQ(statistics.value().average_min_max_statistics().min_sum(), 1.0f); + EXPECT_EQ(statistics.value().average_min_max_statistics().max_sum(), 10.0f); + EXPECT_EQ(statistics.value().average_min_max_statistics().num_samples(), 1); } + +TEST(HistogramStatisticsCollectorTest, SingleBatchSimple) { + CalibrationOptions calib_opts; + calib_opts.set_calibration_method( + CalibrationOptions::CALIBRATION_METHOD_HISTOGRAM_MSE_MAX_FREQUENCY); + auto collector = CalibrationStatisticsCollectorHistogram(); + + collector.Collect( + /*min=*/1.f, /*max=*/16.f, /*histogram=*/{1, 0, 3, 5, 7, 6, 5, 0}); + + std::optional statistics = collector.GetStatistics(); + EXPECT_TRUE(statistics.has_value()); + EXPECT_EQ(statistics.value().histogram_statistics().lower_bound(), 0.f); + EXPECT_EQ(statistics.value().histogram_statistics().bin_width(), 2.f); + // Trailing zeros should be removed. + EXPECT_THAT(statistics.value().histogram_statistics().hist_freq(), + ElementsAre(1, 0, 3, 5, 7, 6, 5)); +} + +TEST(HistogramStatisticsCollectorTest, AggregateSameBatchSize) { + CalibrationOptions calib_opts; + calib_opts.set_calibration_method( + CalibrationOptions::CALIBRATION_METHOD_HISTOGRAM_MSE_MAX_FREQUENCY); + auto collector = CalibrationStatisticsCollectorHistogram(); + + collector.Collect( + /*min=*/1.f, /*max=*/16.f, /*histogram=*/{1, 0, 3, 5, 7, 6, 5, 1}); + + std::optional statistics = collector.GetStatistics(); + EXPECT_TRUE(statistics.has_value()); + EXPECT_EQ(statistics.value().histogram_statistics().lower_bound(), 0.f); + EXPECT_EQ(statistics.value().histogram_statistics().bin_width(), 2.f); + EXPECT_THAT(statistics.value().histogram_statistics().hist_freq(), + ElementsAre(1, 0, 3, 5, 7, 6, 5, 1)); + + collector.Collect( + /*min=*/-1.f, /*max=*/12.f, /*histogram=*/{1, 0, 1, 2, 2, 1, 1, 0}); + + statistics = collector.GetStatistics(); + EXPECT_TRUE(statistics.has_value()); + 
EXPECT_EQ(statistics.value().histogram_statistics().lower_bound(), -2.f); + EXPECT_EQ(statistics.value().histogram_statistics().bin_width(), 2.f); + EXPECT_THAT(statistics.value().histogram_statistics().hist_freq(), + ElementsAre(1, 1, 1, 5, 7, 8, 7, 5, 1)); +} + +TEST(HistogramStatisticsCollectorTest, AggregateSmallerBatchSizeExpandLeft) { + CalibrationOptions calib_opts; + calib_opts.set_calibration_method( + CalibrationOptions::CALIBRATION_METHOD_HISTOGRAM_MSE_MAX_FREQUENCY); + auto collector = CalibrationStatisticsCollectorHistogram(); + + collector.Collect( + /*min=*/1.f, /*max=*/16.f, /*histogram=*/{1, 0, 3, 5, 7, 6, 5, 1}); + + std::optional statistics = collector.GetStatistics(); + EXPECT_TRUE(statistics.has_value()); + EXPECT_EQ(statistics.value().histogram_statistics().lower_bound(), 0.f); + EXPECT_EQ(statistics.value().histogram_statistics().bin_width(), 2.f); + EXPECT_THAT(statistics.value().histogram_statistics().hist_freq(), + ElementsAre(1, 0, 3, 5, 7, 6, 5, 1)); + + collector.Collect( + /*min=*/-1.f, /*max=*/5.f, /*histogram=*/{1, 0, 1, 2, 2, 1, 1, 0}); + + statistics = collector.GetStatistics(); + EXPECT_TRUE(statistics.has_value()); + EXPECT_EQ(statistics.value().histogram_statistics().lower_bound(), -2.f); + EXPECT_EQ(statistics.value().histogram_statistics().bin_width(), 2.f); + EXPECT_THAT(statistics.value().histogram_statistics().hist_freq(), + ElementsAre(1, 2, 4, 5, 5, 7, 6, 5, 1)); +} + +TEST(HistogramStatisticsCollectorTest, AggregateSmallerBatchSizeExpandRight) { + CalibrationOptions calib_opts; + calib_opts.set_calibration_method( + CalibrationOptions::CALIBRATION_METHOD_HISTOGRAM_MSE_MAX_FREQUENCY); + auto collector = CalibrationStatisticsCollectorHistogram(); + + collector.Collect( + /*min=*/1.f, /*max=*/16.f, /*histogram=*/{1, 0, 3, 5, 7, 6, 5, 1}); + + std::optional statistics = collector.GetStatistics(); + EXPECT_TRUE(statistics.has_value()); + EXPECT_EQ(statistics.value().histogram_statistics().lower_bound(), 0.f); + 
EXPECT_EQ(statistics.value().histogram_statistics().bin_width(), 2.f); + EXPECT_THAT(statistics.value().histogram_statistics().hist_freq(), + ElementsAre(1, 0, 3, 5, 7, 6, 5, 1)); + + collector.Collect( + /*min=*/13.f, /*max=*/19.f, /*histogram=*/{1, 0, 1, 2, 2, 1, 1, 0}); + + statistics = collector.GetStatistics(); + EXPECT_TRUE(statistics.has_value()); + EXPECT_EQ(statistics.value().histogram_statistics().lower_bound(), 0.f); + EXPECT_EQ(statistics.value().histogram_statistics().bin_width(), 2.f); + EXPECT_THAT(statistics.value().histogram_statistics().hist_freq(), + ElementsAre(1, 0, 3, 5, 7, 6, 6, 2, 4, 2)); +} + +TEST(HistogramStatisticsCollectorTest, AggregateTinyBinWidth) { + CalibrationOptions calib_opts; + calib_opts.set_calibration_method( + CalibrationOptions::CALIBRATION_METHOD_HISTOGRAM_MSE_MAX_FREQUENCY); + auto collector = CalibrationStatisticsCollectorHistogram(); + + collector.Collect( + /*min=*/1.f, /*max=*/16.f, /*histogram=*/{1, 0, 3, 5, 7, 6, 5, 1}); + + std::optional statistics = collector.GetStatistics(); + EXPECT_TRUE(statistics.has_value()); + EXPECT_EQ(statistics.value().histogram_statistics().lower_bound(), 0.f); + EXPECT_EQ(statistics.value().histogram_statistics().bin_width(), 2.f); + EXPECT_THAT(statistics.value().histogram_statistics().hist_freq(), + ElementsAre(1, 0, 3, 5, 7, 6, 5, 1)); + + collector.Collect( + /*min=*/-1.f, /*max=*/-0.99998f, /*histogram=*/{1, 0, 1, 2, 2, 1, 1, 0}); + + statistics = collector.GetStatistics(); + EXPECT_TRUE(statistics.has_value()); + EXPECT_EQ(statistics.value().histogram_statistics().lower_bound(), -2.f); + EXPECT_EQ(statistics.value().histogram_statistics().bin_width(), 2.f); + EXPECT_THAT(statistics.value().histogram_statistics().hist_freq(), + ElementsAre(8, 1, 0, 3, 5, 7, 6, 5, 1)); +} + +TEST(HistogramStatisticsCollectorTest, AggregateLargerBatchSizeExpandLeft) { + CalibrationOptions calib_opts; + calib_opts.set_calibration_method( + 
CalibrationOptions::CALIBRATION_METHOD_HISTOGRAM_MSE_MAX_FREQUENCY); + auto collector = CalibrationStatisticsCollectorHistogram(); + + collector.Collect( + /*min=*/1.f, /*max=*/16.f, /*histogram=*/{1, 0, 3, 5, 7, 6, 5, 1}); + + std::optional statistics = collector.GetStatistics(); + EXPECT_TRUE(statistics.has_value()); + EXPECT_EQ(statistics.value().histogram_statistics().lower_bound(), 0.f); + EXPECT_EQ(statistics.value().histogram_statistics().bin_width(), 2.f); + EXPECT_THAT(statistics.value().histogram_statistics().hist_freq(), + ElementsAre(1, 0, 3, 5, 7, 6, 5, 1)); + + collector.Collect( + /*min=*/-5.f, /*max=*/5.f, /*histogram=*/{1, 2, 2, 1}); + + statistics = collector.GetStatistics(); + EXPECT_TRUE(statistics.has_value()); + EXPECT_EQ(statistics.value().histogram_statistics().lower_bound(), -8.f); + EXPECT_EQ(statistics.value().histogram_statistics().bin_width(), 2.f); + EXPECT_THAT(statistics.value().histogram_statistics().hist_freq(), + ElementsAre(0.5, 0.5, 1, 1, 2, 1, 3.5, 5.5, 7, 6, 5, 1)); +} + +TEST(HistogramStatisticsCollectorTest, AggregateLargerBatchSizeExpandRight) { + CalibrationOptions calib_opts; + calib_opts.set_calibration_method( + CalibrationOptions::CALIBRATION_METHOD_HISTOGRAM_MSE_MAX_FREQUENCY); + auto collector = CalibrationStatisticsCollectorHistogram(); + + collector.Collect( + /*min=*/1.f, /*max=*/16.f, /*histogram=*/{1, 0, 3, 5, 7, 6, 5, 1}); + + std::optional statistics = collector.GetStatistics(); + EXPECT_TRUE(statistics.has_value()); + EXPECT_EQ(statistics.value().histogram_statistics().lower_bound(), 0.f); + EXPECT_EQ(statistics.value().histogram_statistics().bin_width(), 2.f); + EXPECT_THAT(statistics.value().histogram_statistics().hist_freq(), + ElementsAre(1, 0, 3, 5, 7, 6, 5, 1)); + + collector.Collect( + /*min=*/10.f, /*max=*/21.f, /*histogram=*/{1, 2, 2, 1}); + + statistics = collector.GetStatistics(); + EXPECT_TRUE(statistics.has_value()); + EXPECT_EQ(statistics.value().histogram_statistics().lower_bound(), 0.f); + 
EXPECT_EQ(statistics.value().histogram_statistics().bin_width(), 2.f); + EXPECT_THAT(statistics.value().histogram_statistics().hist_freq(), + ElementsAre(1, 0, 3, 5, 7.5, 6.5, 6, 2, 1, 1, 0.5, 0.5)); +} + } // namespace } // namespace calibrator } // namespace tensorflow diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics_saver_op.cc b/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics_saver_op.cc new file mode 100644 index 00000000000000..8061ad3fe2d444 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics_saver_op.cc @@ -0,0 +1,187 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include +#include +#include +#include +#include +#include + +#include "absl/base/nullability.h" +#include "absl/container/flat_hash_map.h" +#include "absl/status/status.h" +#include "absl/strings/string_view.h" +#include "absl/types/span.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics_collector_average_min_max.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics_collector_base.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics_collector_histogram.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics_collector_min_max.h" +#include "tensorflow/core/framework/attr_value.pb.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/op_requires.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/platform/logging.h" +#include "tsl/platform/file_system.h" + +namespace tensorflow { +namespace { + +using ::stablehlo::quantization::CalibrationOptions; +using CalibrationMethod = + ::stablehlo::quantization::CalibrationOptions_CalibrationMethod; +using ::tensorflow::calibrator::CalibrationStatistics; +using ::tensorflow::calibrator::CalibrationStatisticsCollectorAverageMinMax; +using ::tensorflow::calibrator::CalibrationStatisticsCollectorBase; +using ::tensorflow::calibrator::CalibrationStatisticsCollectorHistogram; +using ::tensorflow::calibrator::CalibrationStatisticsCollectorMinMax; +using ::tensorflow::calibrator::CalibrationStatisticsMap; + +} // namespace + +REGISTER_OP("CalibrationStatisticsSaver") + .Input("args: Tin") + .Attr("Tin: list(type) >= 0") + .Attr("ids: list(string) >= 
1") + .Attr("calibration_methods: list(int) >= 1") + .Attr("output_file_path: string") + .SetIsStateful() + .Doc(R"doc( +Aggregates and saves the calibration statistics data. + +This op collects outputs of multiples CustomAggregator ops, which includes +`min`, `max` and `histogram`. Then it aggregates them according to the +calibration method and save the result to the given file path as a binary +proto file.)doc"); + +class CalibrationStatisticsSaverOp : public OpKernel { + public: + explicit CalibrationStatisticsSaverOp( + absl::Nonnull context) + : OpKernel(context) { + std::string output_file_path; + OP_REQUIRES_OK(context, + context->GetAttr("output_file_path", &output_file_path)); + OP_REQUIRES_OK(context, context->env()->NewWritableFile(output_file_path, + &output_file_)); + + OP_REQUIRES_OK(context, context->GetAttr("ids", &ids_)); + OP_REQUIRES_OK(context, context->GetAttr("calibration_methods", + &calibration_methods_)); + OP_REQUIRES( + context, ids_.size() == calibration_methods_.size(), + absl::AbortedError( + "The `ids` and `calibration_methods` must have the same size.")); + + // Check the number and type of inputs. + OP_REQUIRES(context, context->num_inputs() == ids_.size() * 3, + absl::AbortedError("The number of inputs must be three times " + "the size of the `ids` list.")); + for (int i = 0; i < ids_.size(); ++i) { + OP_REQUIRES(context, context->input_type(i * 3) == DT_FLOAT, + absl::AbortedError("The input `min` must have float type.")); + OP_REQUIRES(context, context->input_type(i * 3 + 1) == DT_FLOAT, + absl::AbortedError("The input `max` must have float type.")); + OP_REQUIRES( + context, context->input_type(i * 3 + 2) == DT_INT64, + absl::AbortedError("The input `histogram` must have int64 type.")); + } + } + + ~CalibrationStatisticsSaverOp() override { + // Save to file during destruction so we only save it once. + // TODO - b/335044516 : Find a way to flush outside of the destructor. 
+    CalibrationStatisticsMap statistics_map;
+    for (const auto& [id, collector] : id_to_collector_) {
+      std::optional<CalibrationStatistics> statistics =
+          collector->GetStatistics();
+      if (!statistics.has_value()) continue;
+
+      statistics_map.mutable_statistics()->emplace(id, std::move(*statistics));
+    }
+
+    if (auto status = output_file_->Append(statistics_map.SerializeAsString());
+        !status.ok()) {
+      LOG(ERROR) << "Failed to write calibration statistics: "
+                 << status.message();
+    }
+    if (auto status = output_file_->Close(); !status.ok()) {
+      LOG(ERROR) << "Failed to close calibration statistics file: "
+                 << status.message();
+    }
+  }
+
+  void Compute(absl::Nonnull<OpKernelContext*> context) override {
+    for (int idx = 0; idx < ids_.size(); ++idx) {
+      AssignIfNotExists(
+          ids_[idx], static_cast<CalibrationMethod>(calibration_methods_[idx]));
+
+      const Tensor& min_tensor = context->input(3 * idx);
+      const Tensor& max_tensor = context->input(3 * idx + 1);
+      const Tensor& histogram_tensor = context->input(3 * idx + 2);
+
+      const float min_value = min_tensor.scalar<float>()();
+      const float max_value = max_tensor.scalar<float>()();
+      auto histogram_flat = histogram_tensor.flat<int64_t>();
+      absl::Span<const int64_t> histogram_data =
+          absl::MakeSpan(histogram_flat.data(), histogram_flat.size());
+      id_to_collector_[ids_[idx]]->Collect(min_value, max_value,
+                                           histogram_data);
+    }
+  }
+
+ private:
+  // The path to save calibration statistics data.
+  std::unique_ptr<tsl::WritableFile> output_file_;
+  // The id and calibration method of preceding CustomAggregator ops.
+  std::vector<std::string> ids_;
+  std::vector<int64_t> calibration_methods_;
+  // Map from id to its collector instance.
+  absl::flat_hash_map<std::string, std::unique_ptr<CalibrationStatisticsCollectorBase>>
+      id_to_collector_;
+
+  void AssignIfNotExists(absl::string_view id,
+                         const CalibrationMethod calibration_method) {
+    std::unique_ptr<CalibrationStatisticsCollectorBase>& collector =
+        id_to_collector_[id];
+
+    if (collector != nullptr) return;
+
+    switch (calibration_method) {
+      case CalibrationOptions::CALIBRATION_METHOD_AVERAGE_MIN_MAX:
+        collector =
+            std::make_unique<CalibrationStatisticsCollectorAverageMinMax>();
+        break;
+      case CalibrationOptions::CALIBRATION_METHOD_HISTOGRAM_PERCENTILE:
+      case CalibrationOptions::CALIBRATION_METHOD_HISTOGRAM_MSE_BRUTEFORCE:
+      case CalibrationOptions::CALIBRATION_METHOD_HISTOGRAM_MSE_SYMMETRIC:
+      case CalibrationOptions::CALIBRATION_METHOD_HISTOGRAM_MSE_MAX_FREQUENCY:
+        collector = std::make_unique<CalibrationStatisticsCollectorHistogram>();
+        break;
+      case CalibrationOptions::CALIBRATION_METHOD_MIN_MAX:
+      default:
+        collector = std::make_unique<CalibrationStatisticsCollectorMinMax>();
+    }
+  }
+};
+
+REGISTER_KERNEL_BUILDER(Name("CalibrationStatisticsSaver").Device(DEVICE_CPU),
+                        CalibrationStatisticsSaverOp);
+
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics_saver_op_test.cc b/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics_saver_op_test.cc
new file mode 100644
index 00000000000000..8335722cdea929
--- /dev/null
+++ b/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics_saver_op_test.cc
@@ -0,0 +1,291 @@
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/ + +#include +#include +#include + +#include +#include +#include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics.pb.h" +#include "tensorflow/core/framework/node_def_builder.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/kernels/ops_testutil.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/path.h" +#include "tensorflow/core/platform/test.h" +#include "tsl/platform/errors.h" +#include "tsl/platform/status.h" +#include "tsl/platform/status_matchers.h" + +namespace tensorflow { +namespace { + +using ::stablehlo::quantization::CalibrationOptions; +using ::tensorflow::calibrator::CalibrationStatistics; +using ::tensorflow::calibrator::CalibrationStatisticsMap; +using ::testing::Contains; +using ::testing::ElementsAre; +using ::testing::HasSubstr; +using ::testing::Key; +using ::testing::SizeIs; +using ::tsl::testing::StatusIs; + +class CalibrationStatisticsSaverTest : public OpsTestBase {}; + +TEST_F(CalibrationStatisticsSaverTest, MissingOutputPath) { + std::vector ids{"1"}; + std::vector calibration_methods{ + CalibrationOptions::CALIBRATION_METHOD_AVERAGE_MIN_MAX}; + + std::vector inputs; + inputs.emplace_back("min", 0, DT_FLOAT); + inputs.emplace_back("max", 0, DT_FLOAT); + + TF_CHECK_OK(NodeDefBuilder("op", "CalibrationStatisticsSaver") + .Input(inputs) + .Attr("ids", ids) + .Attr("calibration_methods", calibration_methods) + .Finalize(node_def())); + ASSERT_THAT(InitOp(), + StatusIs(tsl::error::INVALID_ARGUMENT, + HasSubstr("NodeDef missing attr 'output_file_path'"))); +} + +TEST_F(CalibrationStatisticsSaverTest, WrongNumInputs) { + std::vector ids{"1"}; + std::vector calibration_methods{ + CalibrationOptions::CALIBRATION_METHOD_AVERAGE_MIN_MAX}; + + std::vector inputs; + inputs.emplace_back("min", 
0, DT_FLOAT); + inputs.emplace_back("max", 0, DT_FLOAT); + + TF_CHECK_OK(NodeDefBuilder("op", "CalibrationStatisticsSaver") + .Input(inputs) + .Attr("ids", ids) + .Attr("calibration_methods", calibration_methods) + .Attr("output_file_path", "/tmp/statistics.pbtxt") + .Finalize(node_def())); + ASSERT_THAT(InitOp(), + StatusIs(tsl::error::ABORTED, + HasSubstr("The number of inputs must be three times " + "the size of the `ids` list."))); +} + +TEST_F(CalibrationStatisticsSaverTest, WrongInputTypes) { + std::vector ids{"1"}; + std::vector calibration_methods{ + CalibrationOptions::CALIBRATION_METHOD_AVERAGE_MIN_MAX}; + + std::vector inputs; + inputs.emplace_back("min", 0, DT_FLOAT); + inputs.emplace_back("max", 0, DT_FLOAT); + inputs.emplace_back("histogram", 0, DT_FLOAT); + + TF_CHECK_OK(NodeDefBuilder("op", "CalibrationStatisticsSaver") + .Input(inputs) + .Attr("ids", ids) + .Attr("calibration_methods", calibration_methods) + .Attr("output_file_path", "/tmp/statistics.pbtxt") + .Finalize(node_def())); + ASSERT_THAT( + InitOp(), + StatusIs(tsl::error::ABORTED, + HasSubstr("The input `histogram` must have int64 type"))); +} + +TEST_F(CalibrationStatisticsSaverTest, SimpleMinMax) { + std::vector ids{"1"}; + std::vector calibration_methods{ + CalibrationOptions::CALIBRATION_METHOD_MIN_MAX}; + + std::vector inputs; + inputs.emplace_back("min", 0, DT_FLOAT); + inputs.emplace_back("max", 0, DT_FLOAT); + inputs.emplace_back("histogram", 0, DT_INT64); + + const std::string dir = testing::TmpDir(); + const std::string output_file_path = io::JoinPath(dir, "statistics.pbtxt"); + + TF_CHECK_OK(NodeDefBuilder("op", "CalibrationStatisticsSaver") + .Input(inputs) + .Attr("ids", ids) + .Attr("calibration_methods", calibration_methods) + .Attr("output_file_path", output_file_path) + .Finalize(node_def())); + TF_CHECK_OK(InitOp()); + + AddInputFromArray(TensorShape({}), {1.f}); + AddInputFromArray(TensorShape({}), {5.f}); + AddInputFromArray(TensorShape({0}), {}); + + 
TF_CHECK_OK(RunOpKernel()); + kernel_.reset(); + + CalibrationStatisticsMap statistics_map; + TF_CHECK_OK( + ReadBinaryProto(Env::Default(), output_file_path, &statistics_map)); + ASSERT_THAT(statistics_map.statistics(), SizeIs(1)); + ASSERT_THAT(statistics_map.statistics(), ElementsAre(Key("1"))); + + const CalibrationStatistics& stats = statistics_map.statistics().at("1"); + ASSERT_TRUE(stats.has_min_max_statistics()); + EXPECT_FLOAT_EQ(stats.min_max_statistics().global_min(), 1.f); + EXPECT_FLOAT_EQ(stats.min_max_statistics().global_max(), 5.f); +} + +TEST_F(CalibrationStatisticsSaverTest, SimpleAverageMinMax) { + std::vector ids{"1"}; + std::vector calibration_methods{ + CalibrationOptions::CALIBRATION_METHOD_AVERAGE_MIN_MAX}; + + std::vector inputs; + inputs.emplace_back("min", 0, DT_FLOAT); + inputs.emplace_back("max", 0, DT_FLOAT); + inputs.emplace_back("histogram", 0, DT_INT64); + + const std::string dir = testing::TmpDir(); + const std::string output_file_path = io::JoinPath(dir, "statistics.pbtxt"); + + TF_CHECK_OK(NodeDefBuilder("op", "CalibrationStatisticsSaver") + .Input(inputs) + .Attr("ids", ids) + .Attr("calibration_methods", calibration_methods) + .Attr("output_file_path", output_file_path) + .Finalize(node_def())); + TF_CHECK_OK(InitOp()); + + AddInputFromArray(TensorShape({}), {1.f}); + AddInputFromArray(TensorShape({}), {5.f}); + AddInputFromArray(TensorShape({0}), {}); + + TF_CHECK_OK(RunOpKernel()); + kernel_.reset(); + + CalibrationStatisticsMap statistics_map; + TF_CHECK_OK( + ReadBinaryProto(Env::Default(), output_file_path, &statistics_map)); + ASSERT_THAT(statistics_map.statistics(), SizeIs(1)); + ASSERT_THAT(statistics_map.statistics(), ElementsAre(Key("1"))); + + const CalibrationStatistics& stats = statistics_map.statistics().at("1"); + ASSERT_TRUE(stats.has_average_min_max_statistics()); + EXPECT_FLOAT_EQ(stats.average_min_max_statistics().min_sum(), 1.f); + EXPECT_FLOAT_EQ(stats.average_min_max_statistics().max_sum(), 5.f); + 
EXPECT_EQ(stats.average_min_max_statistics().num_samples(), 1); +} + +TEST_F(CalibrationStatisticsSaverTest, SimpleHistogram) { + std::vector ids{"1"}; + std::vector calibration_methods{ + CalibrationOptions::CALIBRATION_METHOD_HISTOGRAM_MSE_BRUTEFORCE}; + + std::vector inputs; + inputs.emplace_back("min", 0, DT_FLOAT); + inputs.emplace_back("max", 0, DT_FLOAT); + inputs.emplace_back("histogram", 0, DT_INT64); + + const std::string dir = testing::TmpDir(); + const std::string output_file_path = io::JoinPath(dir, "statistics.pbtxt"); + + TF_CHECK_OK(NodeDefBuilder("op", "CalibrationStatisticsSaver") + .Input(inputs) + .Attr("ids", ids) + .Attr("calibration_methods", calibration_methods) + .Attr("output_file_path", output_file_path) + .Finalize(node_def())); + TF_CHECK_OK(InitOp()); + + AddInputFromArray(TensorShape({}), {1.f}); + AddInputFromArray(TensorShape({}), {5.f}); + AddInputFromArray(TensorShape({8}), {1, 4, 6, 7, 3, 2, 1, 0}); + + TF_CHECK_OK(RunOpKernel()); + kernel_.reset(); + + CalibrationStatisticsMap statistics_map; + TF_CHECK_OK( + ReadBinaryProto(Env::Default(), output_file_path, &statistics_map)); + ASSERT_THAT(statistics_map.statistics(), SizeIs(1)); + ASSERT_THAT(statistics_map.statistics(), ElementsAre(Key("1"))); + + const CalibrationStatistics& stats = statistics_map.statistics().at("1"); + ASSERT_TRUE(stats.has_histogram_statistics()); + EXPECT_FLOAT_EQ(stats.histogram_statistics().bin_width(), 0.5f); + EXPECT_FLOAT_EQ(stats.histogram_statistics().lower_bound(), 1.f); + EXPECT_THAT(stats.histogram_statistics().hist_freq(), + ElementsAre(1, 4, 6, 7, 3, 2, 1)); +} + +TEST_F(CalibrationStatisticsSaverTest, MultipleStats) { + std::vector ids{"1", "2"}; + std::vector calibration_methods{ + CalibrationOptions::CALIBRATION_METHOD_AVERAGE_MIN_MAX, + CalibrationOptions::CALIBRATION_METHOD_HISTOGRAM_MSE_BRUTEFORCE}; + + std::vector inputs; + inputs.emplace_back("min", 0, DT_FLOAT); + inputs.emplace_back("max", 0, DT_FLOAT); + 
inputs.emplace_back("histogram", 0, DT_INT64); + inputs.emplace_back("min", 0, DT_FLOAT); + inputs.emplace_back("max", 0, DT_FLOAT); + inputs.emplace_back("histogram", 0, DT_INT64); + + const std::string dir = testing::TmpDir(); + const std::string output_file_path = io::JoinPath(dir, "statistics.pbtxt"); + + TF_CHECK_OK(NodeDefBuilder("op", "CalibrationStatisticsSaver") + .Input(inputs) + .Attr("ids", ids) + .Attr("calibration_methods", calibration_methods) + .Attr("output_file_path", output_file_path) + .Finalize(node_def())); + TF_CHECK_OK(InitOp()); + + AddInputFromArray(TensorShape({}), {1.f}); + AddInputFromArray(TensorShape({}), {5.f}); + AddInputFromArray(TensorShape({0}), {}); + AddInputFromArray(TensorShape({}), {1.f}); + AddInputFromArray(TensorShape({}), {5.f}); + AddInputFromArray(TensorShape({8}), {1, 4, 6, 7, 3, 2, 1, 0}); + + TF_CHECK_OK(RunOpKernel()); + kernel_.reset(); + + CalibrationStatisticsMap statistics_map; + TF_CHECK_OK( + ReadBinaryProto(Env::Default(), output_file_path, &statistics_map)); + ASSERT_THAT(statistics_map.statistics(), SizeIs(2)); + ASSERT_THAT(statistics_map.statistics(), Contains(Key("1"))); + ASSERT_THAT(statistics_map.statistics(), Contains(Key("2"))); + + const CalibrationStatistics& stats_1 = statistics_map.statistics().at("1"); + ASSERT_TRUE(stats_1.has_average_min_max_statistics()); + EXPECT_FLOAT_EQ(stats_1.average_min_max_statistics().min_sum(), 1.f); + EXPECT_FLOAT_EQ(stats_1.average_min_max_statistics().max_sum(), 5.f); + EXPECT_EQ(stats_1.average_min_max_statistics().num_samples(), 1); + + const CalibrationStatistics& stats_2 = statistics_map.statistics().at("2"); + ASSERT_TRUE(stats_2.has_histogram_statistics()); + EXPECT_FLOAT_EQ(stats_2.histogram_statistics().bin_width(), 0.5f); + EXPECT_FLOAT_EQ(stats_2.histogram_statistics().lower_bound(), 1.f); + EXPECT_THAT(stats_2.histogram_statistics().hist_freq(), + ElementsAre(1, 4, 6, 7, 3, 2, 1)); +} + +} // namespace +} // namespace tensorflow diff --git 
a/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibrator_singleton.cc b/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibrator_singleton.cc index 7fe3b34c8137d1..74575b761737a3 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibrator_singleton.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibrator_singleton.cc @@ -60,40 +60,27 @@ void CalibratorSingleton::ClearData(absl::string_view id) { instance.id_to_collector_[id_str].reset(nullptr); } -void CalibratorSingleton::Report(absl::string_view id, - absl::Span data_span, +void CalibratorSingleton::Report(absl::string_view id, const Tensor& min_tensor, + const Tensor& max_tensor, + const Tensor& histogram_tensor, const CalibrationOptions& calib_opts) { - absl::MutexLock lock(&lock_); - - CalibratorSingleton& instance = GetInstance(); - - const std::string id_str{id}; - AssignIfNotExists(id_str, calib_opts); - instance.id_to_collector_[id_str]->Collect(data_span); + const float min_value = min_tensor.scalar()(); + const float max_value = max_tensor.scalar()(); + auto histogram_flat = histogram_tensor.flat(); + absl::Span histogram_data = + absl::MakeSpan(histogram_flat.data(), histogram_flat.size()); + Report(id, min_value, max_value, histogram_data, calib_opts); } -void CalibratorSingleton::Report(absl::string_view id, - const std::vector& data_vec, +void CalibratorSingleton::Report(absl::string_view id, float min, float max, + absl::Span histogram, const CalibrationOptions& calib_opts) { absl::MutexLock lock(&lock_); CalibratorSingleton& instance = GetInstance(); - const std::string id_str{id}; AssignIfNotExists(id_str, calib_opts); - instance.id_to_collector_[id_str]->Collect(data_vec); -} - -void CalibratorSingleton::Report(absl::string_view id, - const Tensor& data_tensor, - const CalibrationOptions& calib_opts) { - absl::MutexLock lock(&lock_); - - CalibratorSingleton& instance = GetInstance(); - - const std::string id_str{id}; - 
AssignIfNotExists(id_str, calib_opts); - instance.id_to_collector_[id_str]->Collect(data_tensor); + instance.id_to_collector_[id_str]->Collect(min, max, histogram); } std::optional CalibratorSingleton::GetStatistics( @@ -111,37 +98,27 @@ std::optional CalibratorSingleton::GetStatistics( return instance.id_to_collector_[id_str]->GetStatistics(); } -int64_t CalibratorSingleton::IssueNewId() { - CalibratorSingleton& instance = GetInstance(); - return instance.next_id_++; -} - void CalibratorSingleton::AssignIfNotExists( std::string id_str, const CalibrationOptions& calib_opts) { CalibratorSingleton& instance = GetInstance(); - - if (!instance.id_to_collector_[id_str]) { - CalibrationOptions::CalibrationMethod calib_method = - calib_opts.calibration_method(); - - switch (calib_method) { - case CalibrationOptions::CALIBRATION_METHOD_AVERAGE_MIN_MAX: - instance.id_to_collector_[id_str] = - std::make_unique(); - break; - case CalibrationOptions::CALIBRATION_METHOD_HISTOGRAM_PERCENTILE: - case CalibrationOptions::CALIBRATION_METHOD_HISTOGRAM_MSE_BRUTEFORCE: - case CalibrationOptions::CALIBRATION_METHOD_HISTOGRAM_MSE_SYMMETRIC: - case CalibrationOptions::CALIBRATION_METHOD_HISTOGRAM_MSE_MAX_FREQUENCY: - instance.id_to_collector_[id_str] = - std::make_unique( - calib_opts); - break; - case CalibrationOptions::CALIBRATION_METHOD_MIN_MAX: - default: - instance.id_to_collector_[id_str] = - std::make_unique(); - } + if (instance.id_to_collector_[id_str]) return; + + switch (calib_opts.calibration_method()) { + case CalibrationOptions::CALIBRATION_METHOD_AVERAGE_MIN_MAX: + instance.id_to_collector_[id_str] = + std::make_unique(); + break; + case CalibrationOptions::CALIBRATION_METHOD_HISTOGRAM_PERCENTILE: + case CalibrationOptions::CALIBRATION_METHOD_HISTOGRAM_MSE_BRUTEFORCE: + case CalibrationOptions::CALIBRATION_METHOD_HISTOGRAM_MSE_SYMMETRIC: + case CalibrationOptions::CALIBRATION_METHOD_HISTOGRAM_MSE_MAX_FREQUENCY: + instance.id_to_collector_[id_str] = + std::make_unique(); + 
break; + case CalibrationOptions::CALIBRATION_METHOD_MIN_MAX: + default: + instance.id_to_collector_[id_str] = + std::make_unique(); } } diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibrator_singleton.h b/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibrator_singleton.h index d909dcecf76a66..8a6aee81ee9cbd 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibrator_singleton.h +++ b/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibrator_singleton.h @@ -47,42 +47,27 @@ class CalibratorSingleton { // Clears the collected data of the given node id. static void ClearData(absl::string_view id); - // Reports data to singleton using float vector. - // Only calculates the required statistics from CalibrationMethod based - // on CalibrationOptions. - static void Report(absl::string_view id, const std::vector& data_vec, + // Reports data to the singleton. Only calculates the required statistics + // based on CalibrationOptions. + static void Report(absl::string_view id, const Tensor& min_tensor, + const Tensor& max_tensor, const Tensor& histogram_tensor, const CalibrationOptions& calib_opts); - // Reports data to singleton using absl::Span - // Only calculates the required statistics from CalibrationMethod based - // on CalibrationOptions. - static void Report(absl::string_view id, absl::Span data_span, - const CalibrationOptions& calib_opts); - - // Reports data to singleton using absl::Span - // Only calculates the required statistics from CalibrationMethod based - // on CalibrationOptions. - static void Report(absl::string_view id, const Tensor& data_tensor, + // Same as above but accepts primitive input types. + static void Report(absl::string_view id, float min, float max, + absl::Span histogram, const CalibrationOptions& calib_opts); // Returns the calibration statistics of the given id. 
static std::optional GetStatistics( absl::string_view id); - // Issues a new node ID that uniquely identifies a set of calibration - // statistics. - static int64_t IssueNewId(); - private: static CalibratorSingleton& GetInstance(); static absl::Mutex lock_; static void AssignIfNotExists(std::string id_str, const CalibrationOptions& calib_opts); - // Indicates the next id for a set of calibration statistics. For every new ID - // issued this will be incremented atomically. - std::atomic next_id_{0}; - absl::flat_hash_map> id_to_collector_; diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibrator_singleton_test.cc b/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibrator_singleton_test.cc index d7652c1b6806c4..ca338b58c5909d 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibrator_singleton_test.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibrator_singleton_test.cc @@ -30,16 +30,12 @@ namespace { using ::stablehlo::quantization::CalibrationOptions; TEST(CalibratorSingletonTest, SimpleMinMax) { - std::vector> report_vec; CalibrationOptions calib_opts; calib_opts.set_calibration_method( CalibrationOptions::CALIBRATION_METHOD_MIN_MAX); - report_vec.push_back({1.0f, 2.0f, 3.0f, 4.0f, 5.0f}); - report_vec.push_back({1.0f, 2.0f, 3.0f, 4.0f, 10.0f}); - report_vec.push_back({-5.0f, 2.0f, 3.0f, 4.0f, 5.0f}); - - CalibratorSingleton::Report(/*id=*/"1", /*data_vec=*/report_vec[0], + CalibratorSingleton::Report(/*id=*/"1", /*min=*/1.0f, /*max=*/5.0f, + /*histogram=*/{}, /*calib_opts=*/calib_opts); std::optional statistics = CalibratorSingleton::GetStatistics(/*id=*/"1"); @@ -48,7 +44,8 @@ TEST(CalibratorSingletonTest, SimpleMinMax) { EXPECT_EQ(statistics.value().min_max_statistics().global_min(), 1.0f); EXPECT_EQ(statistics.value().min_max_statistics().global_max(), 5.0f); - CalibratorSingleton::Report(/*id=*/"1", /*data_vec=*/report_vec[1], + CalibratorSingleton::Report(/*id=*/"1", 
/*min=*/1.0f, /*max=*/10.0f, + /*histogram=*/{}, /*calib_opts=*/calib_opts); statistics = CalibratorSingleton::GetStatistics(/*id=*/"1"); @@ -56,7 +53,8 @@ TEST(CalibratorSingletonTest, SimpleMinMax) { EXPECT_EQ(statistics.value().min_max_statistics().global_min(), 1.0f); EXPECT_EQ(statistics.value().min_max_statistics().global_max(), 10.0f); - CalibratorSingleton::Report(/*id=*/"1", /*data_vec=*/report_vec[2], + CalibratorSingleton::Report(/*id=*/"1", /*min=*/-5.0f, /*max=*/5.0f, + /*histogram=*/{}, /*calib_opts=*/calib_opts); statistics = CalibratorSingleton::GetStatistics(/*id=*/"1"); @@ -66,16 +64,12 @@ TEST(CalibratorSingletonTest, SimpleMinMax) { } TEST(CalibratorSingletonTest, DifferentSessions) { - std::vector> report_vec; CalibrationOptions calib_opts; calib_opts.set_calibration_method( CalibrationOptions::CALIBRATION_METHOD_MIN_MAX); - report_vec.push_back({1.0f, 2.0f, 3.0f, 4.0f, 5.0f}); - report_vec.push_back({1.0f, 2.0f, 3.0f, 4.0f, 10.0f}); - report_vec.push_back({-5.0f, 2.0f, 3.0f, 4.0f, 5.0f}); - - CalibratorSingleton::Report(/*id=*/"2", /*data_vec=*/report_vec[0], + CalibratorSingleton::Report(/*id=*/"2", /*min=*/1.0f, /*max=*/5.0f, + /*histogram=*/{}, /*calib_opts=*/calib_opts); std::optional statistics = CalibratorSingleton::GetStatistics(/*id=*/"2"); @@ -84,7 +78,8 @@ TEST(CalibratorSingletonTest, DifferentSessions) { EXPECT_EQ(statistics.value().min_max_statistics().global_min(), 1.0f); EXPECT_EQ(statistics.value().min_max_statistics().global_max(), 5.0f); - CalibratorSingleton::Report(/*id=*/"2", /*data_vec=*/report_vec[1], + CalibratorSingleton::Report(/*id=*/"2", /*min=*/1.0f, /*max=*/10.0f, + /*histogram=*/{}, /*calib_opts=*/calib_opts); statistics = CalibratorSingleton::GetStatistics(/*id=*/"2"); @@ -92,7 +87,8 @@ TEST(CalibratorSingletonTest, DifferentSessions) { EXPECT_EQ(statistics.value().min_max_statistics().global_min(), 1.0f); EXPECT_EQ(statistics.value().min_max_statistics().global_max(), 10.0f); - 
CalibratorSingleton::Report(/*id=*/"3", /*data_vec=*/report_vec[2], + CalibratorSingleton::Report(/*id=*/"3", /*min=*/-5.0f, /*max=*/5.0f, + /*histogram=*/{}, /*calib_opts=*/calib_opts); statistics = CalibratorSingleton::GetStatistics(/*id=*/"3"); @@ -110,7 +106,8 @@ TEST(CalibratorSingletonTest, ClearAndGetEmptyResult) { report_vec.push_back({1.0f, 2.0f, 3.0f, 4.0f, 5.0f}); report_vec.push_back({1.0f, 2.0f, 3.0f, 4.0f, 10.0f}); - CalibratorSingleton::Report(/*id=*/"4", /*data_vec=*/report_vec[0], + CalibratorSingleton::Report(/*id=*/"4", /*min=*/1.0f, /*max=*/5.0f, + /*histogram=*/{}, /*calib_opts=*/calib_opts); std::optional statistics = CalibratorSingleton::GetStatistics(/*id=*/"4"); @@ -126,16 +123,12 @@ TEST(CalibratorSingletonTest, ClearAndGetEmptyResult) { } TEST(CalibratorSingletonTest, ClearDataAndGetResults) { - std::vector> report_vec; CalibrationOptions calib_opts; calib_opts.set_calibration_method( CalibrationOptions::CALIBRATION_METHOD_MIN_MAX); - report_vec.push_back({1.0f, 2.0f, 3.0f, 4.0f, 5.0f}); - report_vec.push_back({1.0f, 2.0f, 3.0f, 4.0f, 10.0f}); - report_vec.push_back({-5.0f, 2.0f, 3.0f, 4.0f, 5.0f}); - - CalibratorSingleton::Report(/*id=*/"5", /*data_vec=*/report_vec[0], + CalibratorSingleton::Report(/*id=*/"5", /*min=*/1.0f, /*max=*/5.0f, + /*histogram=*/{}, /*calib_opts=*/calib_opts); std::optional statistics = CalibratorSingleton::GetStatistics(/*id=*/"5"); @@ -144,7 +137,8 @@ TEST(CalibratorSingletonTest, ClearDataAndGetResults) { EXPECT_EQ(statistics.value().min_max_statistics().global_min(), 1.0f); EXPECT_EQ(statistics.value().min_max_statistics().global_max(), 5.0f); - CalibratorSingleton::Report(/*id=*/"6", /*data_vec=*/report_vec[1], + CalibratorSingleton::Report(/*id=*/"6", /*min=*/1.0f, /*max=*/10.0f, + /*histogram=*/{}, /*calib_opts=*/calib_opts); statistics = CalibratorSingleton::GetStatistics(/*id=*/"6"); @@ -157,7 +151,8 @@ TEST(CalibratorSingletonTest, ClearDataAndGetResults) { EXPECT_FALSE(statistics.has_value()); - 
CalibratorSingleton::Report(/*id=*/"6", /*data_vec=*/report_vec[1], + CalibratorSingleton::Report(/*id=*/"6", /*min=*/1.0f, /*max=*/10.0f, + /*histogram=*/{}, /*calib_opts=*/calib_opts); statistics = CalibratorSingleton::GetStatistics(/*id=*/"6"); @@ -167,16 +162,12 @@ TEST(CalibratorSingletonTest, ClearDataAndGetResults) { } TEST(CalibratorSingletonTest, SimpleAverageMinMax) { - std::vector> report_vec; CalibrationOptions calib_opts; calib_opts.set_calibration_method( CalibrationOptions::CALIBRATION_METHOD_AVERAGE_MIN_MAX); - report_vec.push_back({-10.0f, 2.0f, 3.0f, 4.0f, 30.0f}); - report_vec.push_back({-20.0f, 2.0f, 3.0f, 4.0f, 60.0f}); - report_vec.push_back({-30.0f, 2.0f, 3.0f, 4.0f, 90.0f}); - - CalibratorSingleton::Report(/*id=*/"7", /*data_vec=*/report_vec[0], + CalibratorSingleton::Report(/*id=*/"7", /*min=*/-10.0f, /*max=*/30.0f, + /*histogram=*/{}, /*calib_opts=*/calib_opts); std::optional statistics = CalibratorSingleton::GetStatistics(/*id=*/"7"); @@ -186,7 +177,8 @@ TEST(CalibratorSingletonTest, SimpleAverageMinMax) { EXPECT_EQ(statistics.value().average_min_max_statistics().max_sum(), 30.0f); EXPECT_EQ(statistics.value().average_min_max_statistics().num_samples(), 1); - CalibratorSingleton::Report(/*id=*/"7", /*data_vec=*/report_vec[1], + CalibratorSingleton::Report(/*id=*/"7", /*min=*/-20.0f, /*max=*/60.0f, + /*histogram=*/{}, /*calib_opts=*/calib_opts); statistics = CalibratorSingleton::GetStatistics(/*id=*/"7"); @@ -195,7 +187,8 @@ TEST(CalibratorSingletonTest, SimpleAverageMinMax) { EXPECT_EQ(statistics.value().average_min_max_statistics().max_sum(), 90.0f); EXPECT_EQ(statistics.value().average_min_max_statistics().num_samples(), 2); - CalibratorSingleton::Report(/*id=*/"7", /*data_vec=*/report_vec[2], + CalibratorSingleton::Report(/*id=*/"7", /*min=*/-30.0f, /*max=*/90.0f, + /*histogram=*/{}, /*calib_opts=*/calib_opts); statistics = CalibratorSingleton::GetStatistics(/*id=*/"7"); @@ -205,12 +198,6 @@ TEST(CalibratorSingletonTest, 
SimpleAverageMinMax) { EXPECT_EQ(statistics.value().average_min_max_statistics().num_samples(), 3); } -TEST(CalibratorSingletonTest, IssueNewIdGeneratesNewId) { - const int64_t id = CalibratorSingleton::IssueNewId(); - const int64_t next_id = CalibratorSingleton::IssueNewId(); - EXPECT_NE(id, next_id); -} - } // namespace } // namespace calibrator } // namespace tensorflow diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/custom_aggregator_op.cc b/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/custom_aggregator_op.cc index b87271f076f3bd..66d932a44f6179 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/custom_aggregator_op.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/custom_aggregator_op.cc @@ -12,32 +12,62 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#define EIGEN_USE_THREADS +#include #include +#include "absl/status/status.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/calibration_parameters.h" #include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibrator_singleton.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.pb.h" +#include "tensorflow/core/framework/attr_value.pb.h" #include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/op_requires.h" #include "tensorflow/core/framework/shape_inference.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/types.h" +#include "tsl/platform/errors.h" namespace tensorflow { +namespace { +using ::stablehlo::quantization::CalculateBinIndexSafe; +using 
::stablehlo::quantization::CalculateBinWidth; +using ::stablehlo::quantization::CalculateLowerBound; using ::stablehlo::quantization::CalibrationOptions; +using ::stablehlo::quantization::GetNumBins; +using CPUDevice = ::Eigen::ThreadPoolDevice; +using CalibrationMethod = + ::stablehlo::quantization::CalibrationOptions_CalibrationMethod; + +} // namespace REGISTER_OP("CustomAggregator") .Input("input: float") .Output("output: float") + .Output("min: float") + .Output("max: float") + .Output("histogram: int64") .Attr("id: string") .Attr("calibration_method: int = 0") .Attr("initial_num_bins: int = 0") .Attr("min_percentile: float = 0.0") .Attr("max_percentile: float = 0.0") - .SetIsStateful() .SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { c->set_output(0, c->input(0)); - return OkStatus(); + c->set_output(1, c->Scalar()); + c->set_output(2, c->Scalar()); + + const tensorflow::AttrValue* calibration_method_attr; + TF_RETURN_IF_ERROR( + c->GetAttr("calibration_method", &calibration_method_attr)); + int32_t num_bins = GetNumBins( + static_cast(calibration_method_attr->i())); + c->set_output(3, c->MakeShape({num_bins})); + + return absl::OkStatus(); }); class CustomAggregatorOp : public OpKernel { @@ -45,20 +75,29 @@ class CustomAggregatorOp : public OpKernel { explicit CustomAggregatorOp(OpKernelConstruction* context) : OpKernel(context) { OP_REQUIRES_OK(context, context->GetAttr("id", &id_)); + + int calibration_method_value; int initial_num_bins; - int calibration_method; float min_percentile; float max_percentile; - OP_REQUIRES_OK( - context, context->GetAttr("calibration_method", (&calibration_method))); + OP_REQUIRES_OK(context, context->GetAttr("calibration_method", + &calibration_method_value)); OP_REQUIRES_OK(context, context->GetAttr("initial_num_bins", &initial_num_bins)); OP_REQUIRES_OK(context, context->GetAttr("min_percentile", &min_percentile)); OP_REQUIRES_OK(context, context->GetAttr("max_percentile", &max_percentile)); - 
calib_opts_.set_calibration_method( - static_cast(calibration_method)); + + auto calibration_method = + static_cast(calibration_method_value); + OP_REQUIRES( + context, + calibration_method != + CalibrationOptions::CALIBRATION_METHOD_UNSPECIFIED, + absl::AbortedError("The calibration method must be specified.")); + + calib_opts_.set_calibration_method(calibration_method); calib_opts_.mutable_calibration_parameters()->set_initial_num_bins( initial_num_bins); calib_opts_.mutable_calibration_parameters()->set_min_percentile( @@ -70,26 +109,59 @@ class CustomAggregatorOp : public OpKernel { void Compute(OpKernelContext* context) override { const Tensor& input_tensor = context->input(0); - auto input_flat = input_tensor.flat(); + // Use the same input for the first output. + context->set_output(0, input_tensor); + + // Calculate min/max statistics. + const auto input_flat = input_tensor.flat(); + Tensor *min_output = nullptr, *max_output = nullptr; + OP_REQUIRES_OK(context, context->allocate_output("min", {}, &min_output)); + OP_REQUIRES_OK(context, context->allocate_output("max", {}, &max_output)); + min_output->scalar().device( + context->template eigen_device()) = input_flat.minimum(); + max_output->scalar().device( + context->template eigen_device()) = input_flat.maximum(); - const int N = input_flat.size(); - if (N == 0) { - // Use the same input for the output. - context->set_output(0, input_tensor); - return; + // Calculate histogram statistics. 
+ int32_t num_bins = GetNumBins(calib_opts_.calibration_method()); + Tensor* histogram_output = nullptr; + OP_REQUIRES_OK(context, context->allocate_output("histogram", {num_bins}, + &histogram_output)); + if (num_bins > 0) { + const float min_value = min_output->scalar()(); + const float max_value = max_output->scalar()(); + CalculateHistogramStatistics(context, input_tensor, min_value, max_value, + num_bins, histogram_output); } // By passing calib_opts_ and input_tensor to CalibratorSingleton, // CalibrationStatisticsCollector can calculate statistics for calibration. - calibrator::CalibratorSingleton::Report(id_, input_tensor, calib_opts_); - - // Use the same input for the output. - context->set_output(0, input_tensor); + calibrator::CalibratorSingleton::Report(id_, *min_output, *max_output, + *histogram_output, calib_opts_); } private: std::string id_; CalibrationOptions calib_opts_; + + void CalculateHistogramStatistics(OpKernelContext* context, + const Tensor& input_tensor, + const float min_value, + const float max_value, + const int32_t num_bins, + Tensor* histogram_tensor) { + const auto input_flat = input_tensor.flat(); + auto histogram_flat = histogram_tensor->flat(); + histogram_flat.setZero(); + + const float bin_width = CalculateBinWidth(min_value, max_value, num_bins); + const float lower_bound = CalculateLowerBound(min_value, bin_width); + for (int i = 0; i < input_flat.size(); ++i) { + int32_t bin_index = CalculateBinIndexSafe( + input_flat.data()[i], lower_bound, bin_width, num_bins); + histogram_flat.data()[bin_index] += 1; + } + } }; REGISTER_KERNEL_BUILDER(Name("CustomAggregator").Device(DEVICE_CPU), diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/custom_aggregator_op.py b/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/custom_aggregator_op.py deleted file mode 100644 index e7e8a5de064426..00000000000000 --- a/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/custom_aggregator_op.py +++ /dev/null 
@@ -1,42 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Custom Aggregator op is for collecting numeric metrics from the given input.""" - -from tensorflow.compiler.mlir.quantization.tensorflow.calibrator import custom_aggregator_op_wrapper -from tensorflow.python.framework import dtypes -from tensorflow.python.framework import load_library -from tensorflow.python.platform import resource_loader - -_custom_aggregator_op = load_library.load_op_library( - resource_loader.get_path_to_datafile('_custom_aggregator_op.so')) - - -def custom_aggregator(input_tensor, tensor_id: str): - """Creates custom aggregator op that collects numeric metrics from the tensor. - - Args: - input_tensor: Tensor to be scanned through this operator. This tensor will - be bypassed to the output tensor of this operator. - tensor_id: String, the identity of the tensor to be scanned. - - Returns: - A `Tensor` of the same value as `input_tensor`. - - Raises: - ValueError: If the given type of `input_tensor` is not float32. 
- """ - if input_tensor.dtype != dtypes.float32: - raise ValueError('Custom aggregator op only accept float32 values.') - return custom_aggregator_op_wrapper.custom_aggregator(input_tensor, tensor_id) diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/integration_test/custom_aggregator_op_test.py b/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/integration_test/custom_aggregator_op_test.py index 4cda958f398ac6..5940803f470117 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/integration_test/custom_aggregator_op_test.py +++ b/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/integration_test/custom_aggregator_op_test.py @@ -46,10 +46,14 @@ def testBypassAndMinMax(self): aggregator = custom_aggregator_op_wrapper.custom_aggregator( input_tensor, - '1', + id='1', calibration_method=_CalibrationMethod.CALIBRATION_METHOD_MIN_MAX, ) - self.assertAllEqual(self.evaluate(aggregator), [1.0, 2.0, 3.0, 4.0, 5.0]) + aggregator_output = self.evaluate(aggregator) + self.assertAllEqual(aggregator_output.output, [1.0, 2.0, 3.0, 4.0, 5.0]) + self.assertEqual(aggregator_output.min, 1.0) + self.assertEqual(aggregator_output.max, 5.0) + self.assertEmpty(aggregator_output.histogram) statistics: calib_stat_pb2.CalibrationStatistics = ( pywrap_calibration.get_statistics_from_calibrator('1') @@ -71,7 +75,12 @@ def testTwoIdentities(self): '2', calibration_method=_CalibrationMethod.CALIBRATION_METHOD_MIN_MAX, ) - self.assertAllEqual(self.evaluate(aggregator1), [1.0, 2.0, 3.0, 4.0, 5.0]) + aggregator1_output = self.evaluate(aggregator1) + self.assertAllEqual(aggregator1_output.output, [1.0, 2.0, 3.0, 4.0, 5.0]) + self.assertEqual(aggregator1_output.min, 1.0) + self.assertEqual(aggregator1_output.max, 5.0) + self.assertEmpty(aggregator1_output.histogram) + input_tensor2 = array_ops.constant( [-1.0, -2.0, -3.0, -4.0, -5.0], dtypes.float32 ) @@ -80,9 +89,13 @@ def testTwoIdentities(self): '3', 
calibration_method=_CalibrationMethod.CALIBRATION_METHOD_MIN_MAX, ) + aggregator2_output = self.evaluate(aggregator2) self.assertAllEqual( - self.evaluate(aggregator2), [-1.0, -2.0, -3.0, -4.0, -5.0] + aggregator2_output.output, [-1.0, -2.0, -3.0, -4.0, -5.0] ) + self.assertEqual(aggregator2_output.min, -5.0) + self.assertEqual(aggregator2_output.max, -1.0) + self.assertEmpty(aggregator2_output.histogram) statistics: calib_stat_pb2 = ( pywrap_calibration.get_statistics_from_calibrator('2') @@ -108,7 +121,12 @@ def testClearData(self): '4', calibration_method=_CalibrationMethod.CALIBRATION_METHOD_MIN_MAX, ) - self.assertAllEqual(self.evaluate(aggregator1), [1.0, 2.0, 3.0, 4.0, 5.0]) + aggregator1_output = self.evaluate(aggregator1) + self.assertAllEqual(aggregator1_output.output, [1.0, 2.0, 3.0, 4.0, 5.0]) + self.assertEqual(aggregator1_output.min, 1.0) + self.assertEqual(aggregator1_output.max, 5.0) + self.assertEmpty(aggregator1_output.histogram) + input_tensor2 = array_ops.constant( [-1.0, -2.0, -3.0, -4.0, -5.0], dtypes.float32 ) @@ -117,9 +135,13 @@ def testClearData(self): '5', calibration_method=_CalibrationMethod.CALIBRATION_METHOD_MIN_MAX, ) + aggregator2_output = self.evaluate(aggregator2) self.assertAllEqual( - self.evaluate(aggregator2), [-1.0, -2.0, -3.0, -4.0, -5.0] + aggregator2_output.output, [-1.0, -2.0, -3.0, -4.0, -5.0] ) + self.assertEqual(aggregator2_output.min, -5.0) + self.assertEqual(aggregator2_output.max, -1.0) + self.assertEmpty(aggregator2_output.histogram) statistics: calib_stat_pb2 = ( pywrap_calibration.get_statistics_from_calibrator('4') @@ -157,10 +179,15 @@ def testBypassAndAverageMinMax(self): '6', calibration_method=_CalibrationMethod.CALIBRATION_METHOD_AVERAGE_MIN_MAX, ) + aggregator1_output = self.evaluate(aggregator1) self.assertAllEqual( - self.evaluate(aggregator1), + aggregator1_output.output, [-50.0, -25.0, 0.0, 25.0, 50.0], ) + self.assertEqual(aggregator1_output.min, -50.0) + self.assertEqual(aggregator1_output.max, 50.0) 
+ self.assertEmpty(aggregator1_output.histogram) + input_tensor2 = array_ops.constant( [-100.0, -50.0, 0.0, 50.0, 100.0], dtypes.float32 ) @@ -169,9 +196,13 @@ def testBypassAndAverageMinMax(self): '6', calibration_method=_CalibrationMethod.CALIBRATION_METHOD_AVERAGE_MIN_MAX, ) + aggregator2_output = self.evaluate(aggregator2) self.assertAllEqual( - self.evaluate(aggregator2), [-100.0, -50.0, 0.0, 50.0, 100.0] + aggregator2_output.output, [-100.0, -50.0, 0.0, 50.0, 100.0] ) + self.assertEqual(aggregator2_output.min, -100.0) + self.assertEqual(aggregator2_output.max, 100.0) + self.assertEmpty(aggregator2_output.histogram) statistics: calib_stat_pb2 = ( pywrap_calibration.get_statistics_from_calibrator('6') @@ -183,6 +214,31 @@ def testBypassAndAverageMinMax(self): self.assertAllEqual((min_sum, max_sum, num_samples), (-150.0, 150.0, 2)) + def testHistogramCalibration(self): + with self.session(): + pywrap_calibration.clear_calibrator() + input_tensor = array_ops.constant( + [1.0, 1.0, 3.0, 4.0, 6.0], dtypes.float32 + ) + + aggregator = custom_aggregator_op_wrapper.custom_aggregator( + input_tensor, + id='7', + calibration_method=_CalibrationMethod.CALIBRATION_METHOD_HISTOGRAM_MSE_BRUTEFORCE, + initial_num_bins=256, + ) + aggregator_output = self.evaluate(aggregator) + self.assertAllEqual(aggregator_output.output, [1.0, 1.0, 3.0, 4.0, 6.0]) + self.assertEqual(aggregator_output.min, 1.0) + self.assertEqual(aggregator_output.max, 6.0) + + self.assertLen(aggregator_output.histogram, 512) + self.assertEqual(sum(aggregator_output.histogram), 5) + self.assertEqual(aggregator_output.histogram[0], 2) + self.assertEqual(aggregator_output.histogram[128], 1) + self.assertEqual(aggregator_output.histogram[192], 1) + self.assertEqual(aggregator_output.histogram[320], 1) + if __name__ == '__main__': test.main() diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/cc/BUILD b/tensorflow/compiler/mlir/quantization/tensorflow/cc/BUILD index 23ce2105634854..218e229828211a 
100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/cc/BUILD +++ b/tensorflow/compiler/mlir/quantization/tensorflow/cc/BUILD @@ -173,6 +173,7 @@ tf_cc_test( srcs = ["constant_fold_test.cc"], deps = [ ":constant_fold", + "//tensorflow/compiler/mlir/quantization/common:attrs_and_constraints", "//tensorflow/compiler/mlir/quantization/common:test_base", "//tensorflow/compiler/mlir/tensorflow", "//tensorflow/core:tensorflow", @@ -184,7 +185,6 @@ tf_cc_test( "@llvm-project//mlir:IR", "@llvm-project//mlir:Support", "@llvm-project//mlir:TransformUtils", - "@llvm-project//mlir:Transforms", ], ) diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/cc/constant_fold_test.cc b/tensorflow/compiler/mlir/quantization/tensorflow/cc/constant_fold_test.cc index 16d3f18364efbf..aaaf088b507e07 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/cc/constant_fold_test.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/cc/constant_fold_test.cc @@ -26,6 +26,7 @@ limitations under the License. 
#include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.h" #include "tensorflow/compiler/mlir/quantization/common/test_base.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/core/platform/test.h" diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/convert_custom_aggregation_op_to_quant_stats.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/convert_custom_aggregation_op_to_quant_stats.cc index 0a5bac34b50c3a..e4229cb97bf45a 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/convert_custom_aggregation_op_to_quant_stats.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/convert_custom_aggregation_op_to_quant_stats.cc @@ -80,7 +80,7 @@ class ConvertCustomAggregationOpToQuantStats // When there are no min and max attributes, remove op. 
if (min == nullptr || max == nullptr) { - op->replaceAllUsesWith(op->getOperands()); + op.getOutput().replaceAllUsesWith(op.getInput()); rewriter.eraseOp(op); return success(); } @@ -93,8 +93,9 @@ class ConvertCustomAggregationOpToQuantStats ElementsAttr axis_stats; IntegerAttr axis; - rewriter.replaceOpWithNewOp( - op, op->getOperand(0), layer_stats, axis_stats, axis); + quantfork::StatisticsOp stats_op = rewriter.create( + op->getLoc(), op.getInput(), layer_stats, axis_stats, axis); + op.getOutput().replaceAllUsesWith(stats_op.getResult()); return success(); } }; diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/convert_tpu_model_to_cpu.td b/tensorflow/compiler/mlir/quantization/tensorflow/passes/convert_tpu_model_to_cpu.td index 9d39d89c42ae53..03e7b18569d7a2 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/convert_tpu_model_to_cpu.td +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/convert_tpu_model_to_cpu.td @@ -26,7 +26,7 @@ def GetBatchFunctionOpArgOperands: // because `TF_BatchFunctionOp` doesn't have the `CallOpInterface` trait. def ReplaceBatchFunctionOpToPartitionedCallOp : Pat< (TF_BatchFunctionOp:$src_op_res - $_, $_, $f, $_, $_, $_, $_, $_, $_, $_, $_, $_, $_, $_, $_, $_), + $_, $_, $f, $_, $_, $_, $_, $_, $_, $_, $_, $_, $_, $_, $_, $_, $_), (TF_PartitionedCallOp (GetBatchFunctionOpArgOperands $src_op_res), $f, diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/insert_custom_aggregation_ops.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/insert_custom_aggregation_ops.cc index 56b9d7393aacfd..5ed89d89339571 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/insert_custom_aggregation_ops.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/insert_custom_aggregation_ops.cc @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include #include #include @@ -35,6 +36,7 @@ limitations under the License. #include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project #include "tensorflow/compiler/mlir/quantization/common/lift_as_function_call.h" #include "tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_utils.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/calibration_parameters.h" #include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/passes/passes.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_quant_ops.h" @@ -268,14 +270,21 @@ class AddCustomAggregationOp : public RewritePattern { calib_opts_.calibration_parameters().max_percentile())), }; + int32_t num_bins = GetNumBins(calib_opts_.calibration_method()); + SmallVector output_types{ + value.getType(), + RankedTensorType::get({}, rewriter.getF32Type()), + RankedTensorType::get({}, rewriter.getF32Type()), + RankedTensorType::get({num_bins}, rewriter.getI64Type()), + }; + // Insert custom aggregation op between operand and operator. 
rewriter.setInsertionPointAfterValue(value); Operation *aggregator_op = rewriter.create( - op->getLoc(), value.getType(), value, attributes); + op->getLoc(), output_types, value, attributes); Value aggregator_op_result = aggregator_op->getOpResult(0); - value.replaceAllUsesWith(aggregator_op_result); - aggregator_op->replaceUsesOfWith(aggregator_op_result, value); + value.replaceAllUsesExcept(aggregator_op_result, aggregator_op); } return success(); diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_quant_ops.td b/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_quant_ops.td index 559c5e31a71f09..e33e226be35515 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_quant_ops.td +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_quant_ops.td @@ -65,7 +65,10 @@ def TF_CustomAggregatorOp : TF_Op<"CustomAggregator", [Pure]> { ); let results = (outs - TensorOf<[TF_Float32]>:$output + TensorOf<[TF_Float32]>:$output, + TensorOf<[TF_Float32]>:$min, + TensorOf<[TF_Float32]>:$max, + TensorOf<[TF_Int64]>:$histogram ); } diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/python/BUILD b/tensorflow/compiler/mlir/quantization/tensorflow/python/BUILD index bd53b29ad79255..78a8321f9f87d4 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/python/BUILD +++ b/tensorflow/compiler/mlir/quantization/tensorflow/python/BUILD @@ -51,10 +51,11 @@ cc_library( "//tensorflow/compiler/mlir/quantization/tensorflow:quantization_options_proto_cc", "//tensorflow/compiler/mlir/quantization/tensorflow:quantize_passes", "//tensorflow/compiler/mlir/quantization/tensorflow:quantize_preprocess", + "//tensorflow/compiler/mlir/quantization/tensorflow/calibrator:calibration_statistics_saver_op", # Required for CalibrationStatisticsSaver op registration. "//tensorflow/compiler/mlir/quantization/tensorflow/calibrator:custom_aggregator_op", # Required for CustomAggregator op registration. 
"//tensorflow/compiler/mlir/quantization/tensorflow/cc:convert_asset_args", "//tensorflow/compiler/mlir/quantization/tensorflow/cc:run_passes", - "//tensorflow/compiler/mlir/quantization/tensorflow/debugging:dump_tensor_op", + "//tensorflow/compiler/mlir/quantization/tensorflow/debugging:dump_tensor_op", # Required for DumpTensor op registration. "//tensorflow/compiler/mlir/tensorflow:mlir_import_options", "//tensorflow/compiler/mlir/tensorflow:translate_lib", "//tensorflow/compiler/mlir/tensorflow/transforms:tf_dialect_passes", @@ -108,7 +109,6 @@ pytype_strict_library( "//tensorflow/compiler/mlir/quantization/tensorflow:quantization_options_proto_py", "//tensorflow/compiler/mlir/quantization/tensorflow/calibrator:calibration_algorithm", "//tensorflow/compiler/mlir/quantization/tensorflow/calibrator:calibration_statistics_proto_py", - "//tensorflow/compiler/mlir/quantization/tensorflow/calibrator:pywrap_calibration", "//tensorflow/core:protos_all_py", "//tensorflow/python/client:session", "//tensorflow/python/eager:context", @@ -120,7 +120,6 @@ pytype_strict_library( "//tensorflow/python/saved_model:loader", "//tensorflow/python/trackable:autotrackable", "//tensorflow/python/types:core", - "//third_party/py/numpy", "@absl_py//absl/logging", ], ) @@ -129,13 +128,7 @@ tf_py_strict_test( name = "py_function_lib_py_test", srcs = ["py_function_lib_test.py"], main = "py_function_lib_test.py", - deps = [ - ":py_function_lib_py", - ":pywrap_function_lib", - "//tensorflow:tensorflow_py", - "//tensorflow/compiler/mlir/quantization/tensorflow:exported_model_proto_py", - "//tensorflow/python/platform:client_testlib", - ], + deps = ["//tensorflow/python/platform:client_testlib"], ) cc_library( @@ -220,6 +213,7 @@ tf_python_pybind_extension( "//tensorflow/core:protos_all_cc", "//tensorflow/core/protobuf:for_core_protos_cc", "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/status", "@com_google_absl//absl/strings:string_view", "@pybind11", ], @@ 
-276,13 +270,11 @@ pytype_strict_library( "//tensorflow/python/framework:importer", "//tensorflow/python/framework:ops", "//tensorflow/python/lib/io:file_io", - "//tensorflow/python/ops:variables", "//tensorflow/python/saved_model:builder", "//tensorflow/python/saved_model:constants", "//tensorflow/python/saved_model:loader", "//tensorflow/python/saved_model:tag_constants", "//tensorflow/python/training:saver", - "//tensorflow/python/training:training_lib", "@absl_py//absl/logging", ], ) @@ -421,7 +413,6 @@ tf_py_strict_test( "//tensorflow/python/saved_model:save", "//tensorflow/python/saved_model:tag_constants", "//tensorflow/python/trackable:autotrackable", - "@absl_py//absl/testing:parameterized", ], ) diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/python/integration_test/quantize_model_test.py b/tensorflow/compiler/mlir/quantization/tensorflow/python/integration_test/quantize_model_test.py index ec86deac1b497d..08ff75ac802613 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/python/integration_test/quantize_model_test.py +++ b/tensorflow/compiler/mlir/quantization/tensorflow/python/integration_test/quantize_model_test.py @@ -6211,25 +6211,25 @@ class CalibrationOptionsTest(quantize_model_test_base.QuantizedModelTest): stablehlo_quant_config_pb2.CalibrationOptions( calibration_method=_CalibrationMethod.CALIBRATION_METHOD_HISTOGRAM_PERCENTILE, calibration_parameters=stablehlo_quant_config_pb2.CalibrationOptions.CalibrationParameters( - initial_num_bins=10, + initial_num_bins=32, ), ), stablehlo_quant_config_pb2.CalibrationOptions( calibration_method=_CalibrationMethod.CALIBRATION_METHOD_HISTOGRAM_MSE_BRUTEFORCE, calibration_parameters=stablehlo_quant_config_pb2.CalibrationOptions.CalibrationParameters( - initial_num_bins=10, + initial_num_bins=32, ), ), stablehlo_quant_config_pb2.CalibrationOptions( calibration_method=_CalibrationMethod.CALIBRATION_METHOD_HISTOGRAM_MSE_MAX_FREQUENCY, 
calibration_parameters=stablehlo_quant_config_pb2.CalibrationOptions.CalibrationParameters( - initial_num_bins=10, + initial_num_bins=32, ), ), stablehlo_quant_config_pb2.CalibrationOptions( calibration_method=_CalibrationMethod.CALIBRATION_METHOD_HISTOGRAM_MSE_SYMMETRIC, calibration_parameters=stablehlo_quant_config_pb2.CalibrationOptions.CalibrationParameters( - initial_num_bins=10, + initial_num_bins=32, ), ), ], diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/python/py_function_lib.h b/tensorflow/compiler/mlir/quantization/tensorflow/python/py_function_lib.h index 4be78b1dc74bf6..fbba72479805d6 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/python/py_function_lib.h +++ b/tensorflow/compiler/mlir/quantization/tensorflow/python/py_function_lib.h @@ -15,6 +15,7 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_PYTHON_PY_FUNCTION_LIB_H_ #define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_PYTHON_PY_FUNCTION_LIB_H_ +#include #include #include #include @@ -46,10 +47,12 @@ class PyFunctionLibrary { // `add_meta_graph_and_variables` function, which is internally used to add a // `MetaGraphDef` to save to the SavedModel. // + // Returns `true` if successful. Returns `std::nullopt` otherwise. + // // If the function signature changes, likely its corresponding .pyi type // hinting and definition should also change. - // LINT.IfChange - virtual void SaveExportedModel( + // LINT.IfChange(save_exported_model) + virtual std::optional SaveExportedModel( absl::string_view dst_saved_model_path, const ExportedModel& exported_model, absl::string_view src_saved_model_path, @@ -70,18 +73,15 @@ class PyFunctionLibrary { // of type `RepresentativeDatasetOrMapping`, which is used to run the // calibration. // - // Returns the updated exported model where the collected calibration - // statistics are added to `CustomAggregator` nodes at the `min` and `max` - // attributes. + // Returns `true` if successful. 
Returns `std::nullopt` otherwise. // // If the function signature changes, likely its corresponding .pyi type // hinting and definition should also change. // LINT.IfChange(run_calibration) - virtual void RunCalibration( + virtual std::optional RunCalibration( absl::string_view saved_model_path, const std::vector& signature_keys, const std::unordered_set& tags, - const ::stablehlo::quantization::CalibrationOptions& calibration_options, bool force_graph_mode_calibration, const absl::flat_hash_map& representative_dataset_file_map) const = 0; @@ -93,14 +93,16 @@ class PyFunctionLibrary { // Retrieves min and max value from `calibration_statistics`, based on the // calibration method specified by `calibration_options`. // + // Returns `std::nullopt` if unsuccessful. + // // If the function signature changes, likely its corresponding .pyi type // hinting and definition should also change. // LINT.IfChange(get_calibration_min_max_value) - virtual stablehlo::quantization::MinMaxValue GetCalibrationMinMaxValue( - const tensorflow::calibrator::CalibrationStatistics& - calibration_statistics, - const ::stablehlo::quantization::CalibrationOptions& calibration_options) - const = 0; + virtual std::optional + GetCalibrationMinMaxValue(const tensorflow::calibrator::CalibrationStatistics& + calibration_statistics, + const ::stablehlo::quantization::CalibrationOptions& + calibration_options) const = 0; // LINT.ThenChange( // pywrap_function_lib.pyi:get_calibration_min_max_value, // py_function_lib.py:get_calibration_min_max_value, diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/python/py_function_lib.py b/tensorflow/compiler/mlir/quantization/tensorflow/python/py_function_lib.py index 00f261f4a66c7c..f630138f81fca1 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/python/py_function_lib.py +++ b/tensorflow/compiler/mlir/quantization/tensorflow/python/py_function_lib.py @@ -15,7 +15,9 @@ """Defines a wrapper class for overridden python method 
definitions.""" from collections.abc import Callable, Collection, Mapping, Sequence -from typing import Optional +import functools +import traceback +from typing import Optional, TypeVar from absl import logging @@ -45,6 +47,11 @@ _ASSETS_DIR = 'assets' _ASSETS_EXTRA_DIR = 'assets.extra' +# Type variable for a type that is not `None`. This represents a return value of +# methods in `PyFunctionLibrary` that should not be `None`, as `None` represents +# that the execution was unsucessful, transfored as `std::nullopt_t` from c++. +NotNoneT = TypeVar('NotNoneT') + def _get_saver_def_or_none( exported_model: exported_model_pb2.ExportedModel, @@ -502,6 +509,117 @@ def _run_graph_for_calibration( logging.info('Calibration step complete.') +def _run_calibration( + saved_model_path: str, + signature_keys: Sequence[str], + tags: Collection[str], + force_graph_mode_calibration: bool, + representative_dataset_file_map: Mapping[ + str, quantization_options_pb2.RepresentativeDatasetFile + ], +) -> bool: + """Runs calibration and adds calibration statistics to exported model. + + Args: + saved_model_path: Path to the SavedModel to run calibration. + signature_keys: List of signature keys corresponding to SignatureDefs to run + calibration on. + tags: A set of tags that identify the MetaGraphDef. + force_graph_mode_calibration: If True, runs the calibration in graph mode. + representative_dataset_file_map: Signature key -> + `RepresentativeDatasetFile` mapping for running the calibration step. Each + dataset file stores the representative dataset for the function matching + the signature key. + + Returns: + `True` upon successfully running calibration. + """ + repr_dataset_map = rd.TfRecordRepresentativeDatasetLoader( + representative_dataset_file_map + ).load() + + # Uses the representative dataset to collect statistics for calibration. + # After this operation, min & max values are stored separately in a global + # CalibratorSingleton instance. 
+ _run_graph_for_calibration( + saved_model_path, + signature_keys, + tags, + repr_dataset_map, + force_graph_mode_calibration, + ) + + # Dummy value to indicate successful run, as `None` would indicate error. See + # comments in `NotNoneT`. + return True + + +def _call_and_return_none_on_error( + func: Callable[[], NotNoneT], error_msg: str +) -> Optional[NotNoneT]: + """Calls `func` and returns `None` on error. + + This is used to gracefully return the 'error status' represented as `None`, as + raising exceptions from `PyFunctionLibrary` methods crashes the program. + + Args: + func: The function to run. The function should be a callable returning a + non-None value. + error_msg: The error message to log upon error. Used for debugging purposes. + + Returns: + `None` if the function raises an exception. The return value of `func` + otherwise. + """ + try: + return func() + except Exception as ex: # pylint: disable=broad-exception-caught; Required for graceful failing with pybind11. + # Prints the exception traceback for debuggability. + traceback.print_exception(ex) + # Additional error log for debuggability. + logging.error(error_msg) + return None + + +def _save_model_and_copy_assets( + exported_model: exported_model_pb2.ExportedModel, + src_saved_model_path: str, + dst_saved_model_path: str, + signature_def_map: Mapping[str, meta_graph_pb2.SignatureDef], + tags: Collection[str], +) -> bool: + """Saves the model and copies the assets from the source model. + + Args: + exported_model: ExportedModel to save. + src_saved_model_path: Path to the source SavedModel. This will be used to + copy the asset files to `dst_saved_model_path`. + dst_saved_model_path: Destination path to save the exported model. + signature_def_map: Signature key -> SignatureDef mapping. + tags: Tags to attach to the saved MetaGraphDef. + + Returns: + `True` upon successfully saving the model. 
+ """ + save_model.save_model_v1( + exported_model.graph_def, + dst_saved_model_path, + signature_def_map, + tags, + init_op_name=exported_model.init_node_name, + saver_def=_get_saver_def_or_none(exported_model), + checkpoint_dir=exported_model.checkpoint_dir, + function_aliases=exported_model.function_aliases, + asset_file_defs=exported_model.asset_file_defs, + ) + + _copy_assets(src_saved_model_path, dst_saved_model_path) + + # Dummy value to indicate successful run, as `None` would indicate error. See + # comments in `NotNoneT`. + return True + + class PyFunctionLibrary(pywrap_function_lib.PyFunctionLibrary): """Wrapper class for overridden python method definitions. @@ -517,7 +635,7 @@ def save_exported_model( src_saved_model_path: str, tags: set[str], serialized_signature_def_map: dict[str, bytes], - ) -> None: + ) -> Optional[bool]: # LINT.ThenChange(py_function_lib.h:save_exported_model) """Saves `ExportedModel` to `dst_saved_model_path` as a SavedModel. @@ -528,6 +646,10 @@ def save_exported_model( copy the asset files to `dst_saved_model_path`. tags: Tags to attach to the saved MetaGraphDef. serialized_signature_def_map: Signature key -> serialized SignatureDef. + + Returns: + `True` upon successful execution. `None` when an error is raised + internally. 
""" exported_model = exported_model_pb2.ExportedModel.FromString( exported_model_serialized @@ -540,20 +662,21 @@ def save_exported_model( serialized_signature_def ) - save_model.save_model_v1( - exported_model.graph_def, - dst_saved_model_path, - signature_def_map, - tags, - init_op_name=exported_model.init_node_name, - saver_def=_get_saver_def_or_none(exported_model), - checkpoint_dir=exported_model.checkpoint_dir, - function_aliases=exported_model.function_aliases, - asset_file_defs=exported_model.asset_file_defs, + return _call_and_return_none_on_error( + func=functools.partial( + _save_model_and_copy_assets, + exported_model, + src_saved_model_path, + dst_saved_model_path, + signature_def_map, + tags, + ), + error_msg=( + f'Failed to save model "{dst_saved_model_path}",' + f' signature_def_map: {signature_def_map}, tags: {tags}.' + ), ) - _copy_assets(src_saved_model_path, dst_saved_model_path) - # TODO: b/311097139 - Extract calibration related functions into a separate # file. # LINT.IfChange(run_calibration) @@ -562,10 +685,9 @@ def run_calibration( saved_model_path: str, signature_keys: list[str], tags: set[str], - calibration_options_serialized: bytes, force_graph_mode_calibration: bool, representative_dataset_file_map_serialized: dict[str, bytes], - ) -> None: + ) -> Optional[bool]: # LINT.ThenChange(py_function_lib.h:run_calibration) """Runs calibration and adds calibration statistics to exported model. @@ -574,7 +696,6 @@ def run_calibration( signature_keys: List of signature keys corresponding to SignatureDefs to run calibration on. tags: A set of tags that identify the MetaGraphDef. - calibration_options_serialized: Serialized `CalibrationOptions`. force_graph_mode_calibration: If True, runs the calibration in graph mode. representative_dataset_file_map_serialized: Signature key -> `RepresentativeDatasetFile` mapping for running the calibration step. @@ -582,10 +703,9 @@ def run_calibration( matching the signature key. 
Returns: - Updated exported model (serialized) where the collected calibration - statistics are added to `CustomerAggregator` nodes at the `min` and `max` - attributes. + The error message if the function raises and exception. `None` otherwise. """ + # Deserialize `RepresentativeDatasetFile` values. dataset_file_map = {} for ( signature_key, @@ -597,19 +717,19 @@ def run_calibration( ) ) - repr_dataset_map = rd.TfRecordRepresentativeDatasetLoader( - dataset_file_map=dataset_file_map - ).load() - - # Uses the representative dataset to collect statistics for calibration. - # After this operation, min & max values are stored separately in a global - # CalibratorSingleton instance. - _run_graph_for_calibration( - saved_model_path, - signature_keys, - tags, - repr_dataset_map, - force_graph_mode_calibration, + return _call_and_return_none_on_error( + func=functools.partial( + _run_calibration, + saved_model_path, + signature_keys, + tags, + force_graph_mode_calibration, + dataset_file_map, + ), + error_msg=( + f'Failed to run calibration on model "{saved_model_path}",' + f' signature_keys: {signature_keys}, tags: {tags}.' + ), ) # LINT.IfChange(get_calibration_min_max_value) @@ -617,7 +737,7 @@ def get_calibration_min_max_value( self, calibration_statistics_serialized: bytes, calibration_options_serialized: bytes, - ) -> tuple[float, float]: + ) -> Optional[tuple[float, float]]: """Calculates min and max values from statistics. Args: @@ -627,17 +747,26 @@ def get_calibration_min_max_value( how the min / max should be calculated. Returns: - (min_value, max_value): Min and max calculated using calib_opts. - - Raises: - ValueError: Unsupported calibration method is given. + (min_value, max_value): Min and max calculated using calib_opts. `None` + upon error. 
""" # LINT.ThenChange(py_function_lib.h:get_calibration_min_max_value) - return calibration_algorithm.get_min_max_value( - calibration_statistics_pb2.CalibrationStatistics.FromString( - calibration_statistics_serialized + + # Deserialize values passed from c++. + statistics = calibration_statistics_pb2.CalibrationStatistics.FromString( + calibration_statistics_serialized + ) + options = stablehlo_quant_config_pb2.CalibrationOptions.FromString( + calibration_options_serialized + ) + + return _call_and_return_none_on_error( + functools.partial( + calibration_algorithm.get_min_max_value, + statistics, + options, ), - stablehlo_quant_config_pb2.CalibrationOptions.FromString( - calibration_options_serialized + error_msg=( + f'Retrieving calibrated min / max failed. Options: {options}.' ), ) diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/python/pywrap_function_lib.cc b/tensorflow/compiler/mlir/quantization/tensorflow/python/pywrap_function_lib.cc index 9e8ecd2352ae94..fc181edb8a75f5 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/python/pywrap_function_lib.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/python/pywrap_function_lib.cc @@ -17,6 +17,7 @@ limitations under the License. 
#include #include "absl/container/flat_hash_map.h" +#include "absl/status/status.h" #include "absl/strings/string_view.h" #include "pybind11/cast.h" // from @pybind11 #include "pybind11/detail/common.h" // from @pybind11 @@ -53,35 +54,36 @@ class PyFunctionLibraryTrampoline : public PyFunctionLibrary { public: using PyFunctionLibrary::PyFunctionLibrary; - void SaveExportedModel(const absl::string_view dst_saved_model_path, - const ExportedModel& exported_model, - const absl::string_view src_saved_model_path, - const std::unordered_set& tags, - const absl::flat_hash_map& - signature_def_map) const override { - PYBIND11_OVERRIDE_PURE(void, PyFunctionLibrary, save_exported_model, - dst_saved_model_path, exported_model, - src_saved_model_path, tags, signature_def_map); + std::optional SaveExportedModel( + const absl::string_view dst_saved_model_path, + const ExportedModel& exported_model, + const absl::string_view src_saved_model_path, + const std::unordered_set& tags, + const absl::flat_hash_map& signature_def_map) + const override { + PYBIND11_OVERRIDE_PURE(std::optional, PyFunctionLibrary, + save_exported_model, dst_saved_model_path, + exported_model, src_saved_model_path, tags, + signature_def_map); } - void RunCalibration( + std::optional RunCalibration( const absl::string_view saved_model_path, const std::vector& signature_keys, const std::unordered_set& tags, - const CalibrationOptions& calibration_options, const bool force_graph_mode_calibration, const absl::flat_hash_map& representative_dataset_file_map) const override { - PYBIND11_OVERRIDE_PURE(void, PyFunctionLibrary, run_calibration, - saved_model_path, signature_keys, tags, - calibration_options, force_graph_mode_calibration, + PYBIND11_OVERRIDE_PURE(std::optional, PyFunctionLibrary, + run_calibration, saved_model_path, signature_keys, + tags, force_graph_mode_calibration, representative_dataset_file_map); } - MinMaxValue GetCalibrationMinMaxValue( + std::optional GetCalibrationMinMaxValue( const 
CalibrationStatistics& calibration_statistics, const CalibrationOptions& calibration_options) const override { - PYBIND11_OVERRIDE_PURE(MinMaxValue, PyFunctionLibrary, + PYBIND11_OVERRIDE_PURE(std::optional, PyFunctionLibrary, get_calibration_min_max_value, calibration_statistics, calibration_options); } @@ -100,8 +102,7 @@ PYBIND11_MODULE(pywrap_function_lib, m) { py::arg("serialized_signature_def_map")) .def("run_calibration", &PyFunctionLibrary::RunCalibration, py::arg("saved_model_path"), py::arg("signature_keys"), - py::arg("tags"), py::arg("calibration_options_serialized"), - py::arg("force_graph_mode_calibration"), + py::arg("tags"), py::arg("force_graph_mode_calibration"), py::arg("representative_dataset_file_map_serialized")) .def("get_calibration_min_max_value", &PyFunctionLibrary::GetCalibrationMinMaxValue, diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/python/pywrap_function_lib.pyi b/tensorflow/compiler/mlir/quantization/tensorflow/python/pywrap_function_lib.pyi index d8c9ed4d9be79e..8e4a7cee6203c7 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/python/pywrap_function_lib.pyi +++ b/tensorflow/compiler/mlir/quantization/tensorflow/python/pywrap_function_lib.pyi @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -from typing import Any +from typing import Any, Optional class PyFunctionLibrary: @@ -24,7 +24,7 @@ class PyFunctionLibrary: src_saved_model_path: str, tags: set[str], serialized_signature_def_map: dict[str, bytes], - ) -> None: ... + ) -> Optional[bool]: ... # LINT.ThenChange() # LINT.IfChange(run_calibration) @@ -33,11 +33,10 @@ class PyFunctionLibrary: saved_model_path: str, signature_keys: list[str], tags: set[str], - calibration_options_serialized: bytes, force_graph_mode_calibration: bool, # Value type: RepresentativeDatasetFile. 
representative_dataset_file_map_serialized: dict[str, bytes], - ) -> None: ... + ) -> Optional[bool]: ... # LINT.ThenChange() # LINT.IfChange(get_calibration_min_max_value) @@ -45,5 +44,5 @@ class PyFunctionLibrary: self, calibration_statistics_serialized: bytes, calibration_options_serialized: bytes, - ) -> tuple[float, float]: ... + ) -> Optional[tuple[float, float]]: ... # LINT.ThenChange() diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/python/pywrap_quantize_model.cc b/tensorflow/compiler/mlir/quantization/tensorflow/python/pywrap_quantize_model.cc index a0865c44664290..6f5db2c5a823e8 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/python/pywrap_quantize_model.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/python/pywrap_quantize_model.cc @@ -223,8 +223,8 @@ PYBIND11_MODULE(pywrap_quantize_model, m) { if (!exported_model.ok()) return exported_model.status(); // Remove the `tpu` tag from the debug quantized saved model as it is - // for CPU. Note the 'tpu' value should be the same as `TPU` defined in - // tensorflow/python/saved_model/tag_constants.py. + // for CPU. Note the 'tpu' value should be the same as `TPU` defined + // in tensorflow/python/saved_model/tag_constants.py. 
if (quantization_options.has_debugger_config()) { tags.erase("tpu"); } diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/python/quantize_model.cc b/tensorflow/compiler/mlir/quantization/tensorflow/python/quantize_model.cc index 89467d30944ca9..9f4621360e2e89 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/python/quantize_model.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/python/quantize_model.cc @@ -437,7 +437,6 @@ absl::StatusOr QuantizeStaticRangePtq( py_function_library.RunCalibration( *precalibrated_saved_model_dir, signature_keys, tags, - quantization_options.calibration_options(), quantization_options.force_graph_mode_calibration(), representative_dataset_file_map_serialized); diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/python/unfreeze_constants.cc b/tensorflow/compiler/mlir/quantization/tensorflow/python/unfreeze_constants.cc index b957ffe469a004..c3f5c32bdd9720 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/python/unfreeze_constants.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/python/unfreeze_constants.cc @@ -47,7 +47,7 @@ absl::Status UnfreezeConstantsAndSaveVariables( }, ctx, module_op)); - if (const tsl::Status create_dir_status = + if (const absl::Status create_dir_status = Env::Default()->CreateDir(std::string(checkpoint_dir)); !create_dir_status.ok()) { LOG(ERROR) << "Failed to create checkpoint directory at: " diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/tests/convert_custom_aggregation_op_to_quant_stats.mlir b/tensorflow/compiler/mlir/quantization/tensorflow/tests/convert_custom_aggregation_op_to_quant_stats.mlir index 02a348b8c8fe3e..f72c9f3388c071 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/tests/convert_custom_aggregation_op_to_quant_stats.mlir +++ b/tensorflow/compiler/mlir/quantization/tensorflow/tests/convert_custom_aggregation_op_to_quant_stats.mlir @@ -1,19 +1,19 @@ // RUN: tf-quant-opt %s 
-quant-convert-tf-custom-aggregator-op-to-quant-stats | FileCheck %s func.func @customAggregator(%arg0: tensor<8x8x8x8xf32>) -> (tensor<8x8x8x8xf32>, tensor<8x8x8x8xf32>) { - %0 = "tf.CustomAggregator"(%arg0) {min = -0.1 : f32, max = 0.2 : f32, id = "0"} : (tensor<8x8x8x8xf32>) -> tensor<8x8x8x8xf32> - %1 = "tf.CustomAggregator"(%arg0) {id = "1"} : (tensor<8x8x8x8xf32>) -> tensor<8x8x8x8xf32> - func.return %0, %1 : tensor<8x8x8x8xf32>, tensor<8x8x8x8xf32> + %0:4 = "tf.CustomAggregator"(%arg0) {min = -0.1 : f32, max = 0.2 : f32, id = "0"} : (tensor<8x8x8x8xf32>) -> (tensor<8x8x8x8xf32>, tensor, tensor, tensor<*xi64>) + %1:4 = "tf.CustomAggregator"(%arg0) {id = "1"} : (tensor<8x8x8x8xf32>) -> (tensor<8x8x8x8xf32>, tensor, tensor, tensor<*xi64>) + func.return %0#0, %1#0 : tensor<8x8x8x8xf32>, tensor<8x8x8x8xf32> } // CHECK: func @customAggregator // CHECK-NEXT: %[[stats:.*]] = "quantfork.stats"(%arg0) {layerStats = dense<[-1.000000e-01, 2.000000e-01]> : tensor<2xf32>} : (tensor<8x8x8x8xf32>) -> tensor<8x8x8x8xf32> // CHECK-NEXT: return %[[stats]], %arg0 func.func @doNotHandleNoMinMaxCases(%arg0: tensor<8x8x8x8xf32>) -> (tensor<8x8x8x8xf32>, tensor<8x8x8x8xf32>, tensor<8x8x8x8xf32>) { - %0 = "tf.CustomAggregator"(%arg0) {min = -0.1 : f32, id = "1"} : (tensor<8x8x8x8xf32>) -> tensor<8x8x8x8xf32> - %1 = "tf.CustomAggregator"(%arg0) {max = 0.2 : f32, id = "2"} : (tensor<8x8x8x8xf32>) -> tensor<8x8x8x8xf32> - %2 = "tf.CustomAggregator"(%arg0) {id = "3"} : (tensor<8x8x8x8xf32>) -> tensor<8x8x8x8xf32> - func.return %0, %1, %2 : tensor<8x8x8x8xf32>, tensor<8x8x8x8xf32>, tensor<8x8x8x8xf32> + %0:4 = "tf.CustomAggregator"(%arg0) {min = -0.1 : f32, id = "1"} : (tensor<8x8x8x8xf32>) -> (tensor<8x8x8x8xf32>, tensor, tensor, tensor<*xi64>) + %1:4 = "tf.CustomAggregator"(%arg0) {max = 0.2 : f32, id = "2"} : (tensor<8x8x8x8xf32>) -> (tensor<8x8x8x8xf32>, tensor, tensor, tensor<*xi64>) + %2:4 = "tf.CustomAggregator"(%arg0) {id = "3"} : (tensor<8x8x8x8xf32>) -> (tensor<8x8x8x8xf32>, 
tensor, tensor, tensor<*xi64>) + func.return %0#0, %1#0, %2#0 : tensor<8x8x8x8xf32>, tensor<8x8x8x8xf32>, tensor<8x8x8x8xf32> } // CHECK: func @doNotHandleNoMinMaxCases // CHECK-NOT: "quantfork.stats" diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/tests/insert_custom_aggregation_ops.mlir b/tensorflow/compiler/mlir/quantization/tensorflow/tests/insert_custom_aggregation_ops.mlir index b8ed5d5f361d36..052da55dce336d 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/tests/insert_custom_aggregation_ops.mlir +++ b/tensorflow/compiler/mlir/quantization/tensorflow/tests/insert_custom_aggregation_ops.mlir @@ -26,10 +26,10 @@ module { // CalibrationOptions(calibration_method=CALIBRATION_METHOD_MIN_MAX) // MIN-MAX-CHECK: func @wrap_composite_func -// MIN-MAX-CHECK-NEXT: [[rhs:%.*]] = "tf.CustomAggregator"(%arg1) <{id = ""}> {calibration_method = 1 : i32, initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<*xf32>) -> tensor<*xf32> -// MIN-MAX-CHECK-NEXT: [[lhs:%.*]] = "tf.CustomAggregator"(%arg0) <{id = ""}> {calibration_method = 1 : i32, initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<*xf32>) -> tensor<*xf32> +// MIN-MAX-CHECK-NEXT: [[rhs:%.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%arg1) <{id = ""}> {calibration_method = 1 : i32, initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<*xf32>) -> (tensor<*xf32>, tensor, tensor, tensor<0xi64>) +// MIN-MAX-CHECK-NEXT: [[lhs:%.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%arg0) <{id = ""}> {calibration_method = 1 : i32, initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<*xf32>) -> (tensor<*xf32>, tensor, tensor, tensor<0xi64>) // MIN-MAX-CHECK-NEXT: [[add:%.*]] = "tf.PartitionedCall"([[lhs]], [[rhs]]) -// MIN-MAX-CHECK-NEXT: [[res:%.*]] = 
"tf.CustomAggregator"([[add]]) <{id = ""}> {calibration_method = 1 : i32, initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<*xf32>) -> tensor<*xf32> +// MIN-MAX-CHECK-NEXT: [[res:%.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"([[add]]) <{id = ""}> {calibration_method = 1 : i32, initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<*xf32>) -> (tensor<*xf32>, tensor, tensor, tensor<0xi64>) // MIN-MAX-CHECK-NEXT: return [[res]] : tensor<*xf32> // MIN-MAX-CHECK: func @no_composite_func @@ -43,10 +43,10 @@ module { // CalibrationOptions(calibration_method=CALIBRATION_METHOD_AVERAGE_MIN_MAX) // AVERAGE-MIN-MAX-CHECK: func @wrap_composite_func -// AVERAGE-MIN-MAX-CHECK-NEXT: [[rhs:%.*]] = "tf.CustomAggregator"(%arg1) <{id = ""}> {calibration_method = 2 : i32, initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<*xf32>) -> tensor<*xf32> -// AVERAGE-MIN-MAX-CHECK-NEXT: [[lhs:%.*]] = "tf.CustomAggregator"(%arg0) <{id = ""}> {calibration_method = 2 : i32, initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<*xf32>) -> tensor<*xf32> +// AVERAGE-MIN-MAX-CHECK-NEXT: [[rhs:%.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%arg1) <{id = ""}> {calibration_method = 2 : i32, initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<*xf32>) -> (tensor<*xf32>, tensor, tensor, tensor<0xi64>) +// AVERAGE-MIN-MAX-CHECK-NEXT: [[lhs:%.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%arg0) <{id = ""}> {calibration_method = 2 : i32, initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<*xf32>) -> (tensor<*xf32>, tensor, tensor, tensor<0xi64>) // AVERAGE-MIN-MAX-CHECK-NEXT: [[add:%.*]] = "tf.PartitionedCall"([[lhs]], [[rhs]]) -// 
AVERAGE-MIN-MAX-CHECK-NEXT: [[res:%.*]] = "tf.CustomAggregator"([[add]]) <{id = ""}> {calibration_method = 2 : i32, initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<*xf32>) -> tensor<*xf32> +// AVERAGE-MIN-MAX-CHECK-NEXT: [[res:%.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"([[add]]) <{id = ""}> {calibration_method = 2 : i32, initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<*xf32>) -> (tensor<*xf32>, tensor, tensor, tensor<0xi64>) // AVERAGE-MIN-MAX-CHECK-NEXT: return [[res]] : tensor<*xf32> // AVERAGE-MIN-MAX-CHECK: func @no_composite_func @@ -63,10 +63,10 @@ module { // calibration_parameters=CalibrationParameters(initial_num_bins=256, min_percentile=0.001, max_percentile=99.999) // ) // HISTOGRAM-PERCENTILE-CHECK: func @wrap_composite_func -// HISTOGRAM-PERCENTILE-CHECK-NEXT: [[rhs:%.*]] = "tf.CustomAggregator"(%arg1) <{id = ""}> {calibration_method = 3 : i32, initial_num_bins = 256 : i32, max_percentile = 9.999900e+01 : f32, min_percentile = 1.000000e-03 : f32} : (tensor<*xf32>) -> tensor<*xf32> -// HISTOGRAM-PERCENTILE-CHECK-NEXT: [[lhs:%.*]] = "tf.CustomAggregator"(%arg0) <{id = ""}> {calibration_method = 3 : i32, initial_num_bins = 256 : i32, max_percentile = 9.999900e+01 : f32, min_percentile = 1.000000e-03 : f32} : (tensor<*xf32>) -> tensor<*xf32> +// HISTOGRAM-PERCENTILE-CHECK-NEXT: [[rhs:%.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%arg1) <{id = ""}> {calibration_method = 3 : i32, initial_num_bins = 256 : i32, max_percentile = 9.999900e+01 : f32, min_percentile = 1.000000e-03 : f32} : (tensor<*xf32>) -> (tensor<*xf32>, tensor, tensor, tensor<512xi64>) +// HISTOGRAM-PERCENTILE-CHECK-NEXT: [[lhs:%.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%arg0) <{id = ""}> {calibration_method = 3 : i32, initial_num_bins = 256 : i32, max_percentile = 9.999900e+01 : f32, min_percentile = 1.000000e-03 : f32} : (tensor<*xf32>) 
-> (tensor<*xf32>, tensor, tensor, tensor<512xi64>) // HISTOGRAM-PERCENTILE-CHECK-NEXT: [[add:%.*]] = "tf.PartitionedCall"([[lhs]], [[rhs]]) -// HISTOGRAM-PERCENTILE-CHECK-NEXT: [[res:%.*]] = "tf.CustomAggregator"([[add]]) <{id = ""}> {calibration_method = 3 : i32, initial_num_bins = 256 : i32, max_percentile = 9.999900e+01 : f32, min_percentile = 1.000000e-03 : f32} : (tensor<*xf32>) -> tensor<*xf32> +// HISTOGRAM-PERCENTILE-CHECK-NEXT: [[res:%.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"([[add]]) <{id = ""}> {calibration_method = 3 : i32, initial_num_bins = 256 : i32, max_percentile = 9.999900e+01 : f32, min_percentile = 1.000000e-03 : f32} : (tensor<*xf32>) -> (tensor<*xf32>, tensor, tensor, tensor<512xi64>) // HISTOGRAM-PERCENTILE-CHECK-NEXT: return [[res]] : tensor<*xf32> // HISTOGRAM-PERCENTILE-CHECK: func @no_composite_func @@ -83,10 +83,10 @@ module { // calibration_parameters=CalibrationParameters(initial_num_bins=256) // ) // HISTOGRAM-MSE-BRUTEFORCE-CHECK: func @wrap_composite_func -// HISTOGRAM-MSE-BRUTEFORCE-CHECK-NEXT: [[rhs:%.*]] = "tf.CustomAggregator"(%arg1) <{id = ""}> {calibration_method = 4 : i32, initial_num_bins = 256 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<*xf32>) -> tensor<*xf32> -// HISTOGRAM-MSE-BRUTEFORCE-CHECK-NEXT: [[lhs:%.*]] = "tf.CustomAggregator"(%arg0) <{id = ""}> {calibration_method = 4 : i32, initial_num_bins = 256 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<*xf32>) -> tensor<*xf32> +// HISTOGRAM-MSE-BRUTEFORCE-CHECK-NEXT: [[rhs:%.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%arg1) <{id = ""}> {calibration_method = 4 : i32, initial_num_bins = 256 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<*xf32>) -> (tensor<*xf32>, tensor, tensor, tensor<512xi64>) +// HISTOGRAM-MSE-BRUTEFORCE-CHECK-NEXT: [[lhs:%.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%arg0) <{id = ""}> 
{calibration_method = 4 : i32, initial_num_bins = 256 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<*xf32>) -> (tensor<*xf32>, tensor, tensor, tensor<512xi64>) // HISTOGRAM-MSE-BRUTEFORCE-CHECK-NEXT: [[add:%.*]] = "tf.PartitionedCall"([[lhs]], [[rhs]]) -// HISTOGRAM-MSE-BRUTEFORCE-CHECK-NEXT: [[res:%.*]] = "tf.CustomAggregator"([[add]]) <{id = ""}> {calibration_method = 4 : i32, initial_num_bins = 256 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<*xf32>) -> tensor<*xf32> +// HISTOGRAM-MSE-BRUTEFORCE-CHECK-NEXT: [[res:%.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"([[add]]) <{id = ""}> {calibration_method = 4 : i32, initial_num_bins = 256 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<*xf32>) -> (tensor<*xf32>, tensor, tensor, tensor<512xi64>) // HISTOGRAM-MSE-BRUTEFORCE-CHECK-NEXT: return [[res]] : tensor<*xf32> // HISTOGRAM-MSE-BRUTEFORCE-CHECK: func @no_composite_func @@ -103,10 +103,10 @@ module { // calibration_parameters=CalibrationParameters(initial_num_bins=256) // ) // HISTOGRAM-MSE-MAX-FREQUENCY-CHECK: func @wrap_composite_func -// HISTOGRAM-MSE-MAX-FREQUENCY-CHECK-NEXT: [[rhs:%.*]] = "tf.CustomAggregator"(%arg1) <{id = ""}> {calibration_method = 5 : i32, initial_num_bins = 256 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<*xf32>) -> tensor<*xf32> -// HISTOGRAM-MSE-MAX-FREQUENCY-CHECK-NEXT: [[lhs:%.*]] = "tf.CustomAggregator"(%arg0) <{id = ""}> {calibration_method = 5 : i32, initial_num_bins = 256 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<*xf32>) -> tensor<*xf32> +// HISTOGRAM-MSE-MAX-FREQUENCY-CHECK-NEXT: [[rhs:%.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%arg1) <{id = ""}> {calibration_method = 5 : i32, initial_num_bins = 256 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : 
(tensor<*xf32>) -> (tensor<*xf32>, tensor, tensor, tensor<512xi64>) +// HISTOGRAM-MSE-MAX-FREQUENCY-CHECK-NEXT: [[lhs:%.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%arg0) <{id = ""}> {calibration_method = 5 : i32, initial_num_bins = 256 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<*xf32>) -> (tensor<*xf32>, tensor, tensor, tensor<512xi64>) // HISTOGRAM-MSE-MAX-FREQUENCY-CHECK-NEXT: [[add:%.*]] = "tf.PartitionedCall"([[lhs]], [[rhs]]) -// HISTOGRAM-MSE-MAX-FREQUENCY-CHECK-NEXT: [[res:%.*]] = "tf.CustomAggregator"([[add]]) <{id = ""}> {calibration_method = 5 : i32, initial_num_bins = 256 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<*xf32>) -> tensor<*xf32> +// HISTOGRAM-MSE-MAX-FREQUENCY-CHECK-NEXT: [[res:%.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"([[add]]) <{id = ""}> {calibration_method = 5 : i32, initial_num_bins = 256 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<*xf32>) -> (tensor<*xf32>, tensor, tensor, tensor<512xi64>) // HISTOGRAM-MSE-MAX-FREQUENCY-CHECK-NEXT: return [[res]] : tensor<*xf32> // HISTOGRAM-MSE-MAX-FREQUENCY-CHECK: func @no_composite_func @@ -123,10 +123,10 @@ module { // calibration_parameters=CalibrationParameters(initial_num_bins=256) // ) // HISTOGRAM-MSE-SYMMETRIC-CHECK: func @wrap_composite_func -// HISTOGRAM-MSE-SYMMETRIC-CHECK-NEXT: [[rhs:%.*]] = "tf.CustomAggregator"(%arg1) <{id = ""}> {calibration_method = 6 : i32, initial_num_bins = 256 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<*xf32>) -> tensor<*xf32> -// HISTOGRAM-MSE-SYMMETRIC-CHECK-NEXT: [[lhs:%.*]] = "tf.CustomAggregator"(%arg0) <{id = ""}> {calibration_method = 6 : i32, initial_num_bins = 256 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<*xf32>) -> tensor<*xf32> +// HISTOGRAM-MSE-SYMMETRIC-CHECK-NEXT: [[rhs:%.*]], {{.*}}, {{.*}}, 
{{.*}} = "tf.CustomAggregator"(%arg1) <{id = ""}> {calibration_method = 6 : i32, initial_num_bins = 256 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<*xf32>) -> (tensor<*xf32>, tensor, tensor, tensor<512xi64>) +// HISTOGRAM-MSE-SYMMETRIC-CHECK-NEXT: [[lhs:%.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%arg0) <{id = ""}> {calibration_method = 6 : i32, initial_num_bins = 256 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<*xf32>) -> (tensor<*xf32>, tensor, tensor, tensor<512xi64>) // HISTOGRAM-MSE-SYMMETRIC-CHECK-NEXT: [[add:%.*]] = "tf.PartitionedCall"([[lhs]], [[rhs]]) -// HISTOGRAM-MSE-SYMMETRIC-CHECK-NEXT: [[res:%.*]] = "tf.CustomAggregator"([[add]]) <{id = ""}> {calibration_method = 6 : i32, initial_num_bins = 256 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<*xf32>) -> tensor<*xf32> +// HISTOGRAM-MSE-SYMMETRIC-CHECK-NEXT: [[res:%.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"([[add]]) <{id = ""}> {calibration_method = 6 : i32, initial_num_bins = 256 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<*xf32>) -> (tensor<*xf32>, tensor, tensor, tensor<512xi64>) // HISTOGRAM-MSE-SYMMETRIC-CHECK-NEXT: return [[res]] : tensor<*xf32> // HISTOGRAM-MSE-SYMMETRIC-CHECK: func @no_composite_func @@ -174,4 +174,4 @@ module { %0 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [1] x [0], precision = [DEFAULT, DEFAULT] : (tensor, tensor<100352x10xf32>) -> tensor return %0 : tensor } -} \ No newline at end of file +} diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/tests/issue_ids_of_custom_aggregation_ops.mlir b/tensorflow/compiler/mlir/quantization/tensorflow/tests/issue_ids_of_custom_aggregation_ops.mlir index 4aa1ae76b8a83d..6a1621cdf17e89 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/tests/issue_ids_of_custom_aggregation_ops.mlir +++ 
b/tensorflow/compiler/mlir/quantization/tensorflow/tests/issue_ids_of_custom_aggregation_ops.mlir @@ -1,17 +1,17 @@ // RUN: tf-quant-opt %s -quant-issues-ids-of-custom-aggregation-ops | FileCheck %s func.func @issue_ids(%arg0: tensor<*xf32>, %arg1: tensor<*xf32>) -> tensor<*xf32> { - %0 = "tf.CustomAggregator"(%arg1) {id = ""} : (tensor<*xf32>) -> tensor<*xf32> - %1 = "tf.CustomAggregator"(%arg0) {id = ""} : (tensor<*xf32>) -> tensor<*xf32> + %0:4 = "tf.CustomAggregator"(%arg1) {id = ""} : (tensor<*xf32>) -> (tensor<*xf32>, tensor, tensor, tensor<*xi64>) + %1:4 = "tf.CustomAggregator"(%arg0) {id = ""} : (tensor<*xf32>) -> (tensor<*xf32>, tensor, tensor, tensor<*xi64>) %2 = "tf.AddV2"(%1, %0) : (tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> - %3 = "tf.CustomAggregator"(%2) {id = ""} : (tensor<*xf32>) -> tensor<*xf32> + %3:4 = "tf.CustomAggregator"(%2) {id = ""} : (tensor<*xf32>) -> (tensor<*xf32>, tensor, tensor, tensor<*xi64>) func.return %3 : tensor<*xf32> } // CHECK: func @issue_ids -// CHECK-NEXT: [[rhs:%.*]] = "tf.CustomAggregator"(%arg1) <{id = "0"}> : (tensor<*xf32>) -> tensor<*xf32> -// CHECK-NEXT: [[lhs:%.*]] = "tf.CustomAggregator"(%arg0) <{id = "1"}> : (tensor<*xf32>) -> tensor<*xf32> +// CHECK-NEXT: [[rhs:%.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%arg1) <{id = "0"}> : (tensor<*xf32>) +// CHECK-NEXT: [[lhs:%.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%arg0) <{id = "1"}> : (tensor<*xf32>) // CHECK-NEXT: [[add:%.*]] = "tf.AddV2"([[lhs]], [[rhs]]) : (tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> -// CHECK-NEXT: [[res:%.*]] = "tf.CustomAggregator"([[add]]) <{id = "2"}> : (tensor<*xf32>) -> tensor<*xf32> +// CHECK-NEXT: [[res:%.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"([[add]]) <{id = "2"}> : (tensor<*xf32>) // CHECK-NEXT: return [[res]] : tensor<*xf32> diff --git a/tensorflow/compiler/mlir/stablehlo/BUILD b/tensorflow/compiler/mlir/stablehlo/BUILD index 5d5342e8a264c4..28e2f104221284 100644 --- 
a/tensorflow/compiler/mlir/stablehlo/BUILD +++ b/tensorflow/compiler/mlir/stablehlo/BUILD @@ -1,4 +1,4 @@ -load("@local_tsl//tsl:tsl.default.bzl", "tsl_pybind_extension") +load("@local_xla//xla/tsl:tsl.default.bzl", "tsl_pybind_extension") load("//tensorflow:pytype.default.bzl", "pytype_strict_library") load("//tensorflow:strict.default.bzl", "py_strict_test") diff --git a/tensorflow/compiler/mlir/tensorflow/ir/host_runtime/tfrt_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/host_runtime/tfrt_ops.td index 83eac78b7574d6..0c783c01caa287 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/host_runtime/tfrt_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/host_runtime/tfrt_ops.td @@ -109,7 +109,8 @@ def TF_IfrtLoadVariableOp : TF_Op<"IfrtLoadVariable", [Pure]> { configuration specified in `VariableDeviceShardingConfigProto`. This op returns a scalar string tensor containing the loaded variable name, which can be - used as a key to look for the loaded IFRT array in runtime. + used as a key to look for the loaded IFRT array in runtime and a restored tensor, which + maybe lowered to a future by runtime. 
}]; let arguments = (ins @@ -119,7 +120,8 @@ def TF_IfrtLoadVariableOp : TF_Op<"IfrtLoadVariable", [Pure]> { ); let results = (outs - TF_StrTensor:$array_key + TF_StrTensor:$array_key, + TF_Tensor: $tensor_future ); TF_DerivedOperandTypeListAttr Tin = TF_DerivedOperandTypeListAttr<0>; diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td index 5e0e58c279e358..6ba660297366ba 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td @@ -1136,6 +1136,7 @@ to be batched.}]>:$captured_tensors, DefaultValuedOptionalAttr:$low_priority_batch_timeout_micros, DefaultValuedOptionalAttr:$low_priority_allowed_batch_sizes, DefaultValuedOptionalAttr:$low_priority_max_enqueued_batches, + DefaultValuedOptionalAttr, "\"low_priority_padding_with_max_batch_size\"">:$mixed_priority_policy, DefaultValuedOptionalAttr:$enable_large_batch_splitting ); diff --git a/tensorflow/compiler/mlir/tensorflow/tests/order_by_dialect.mlir b/tensorflow/compiler/mlir/tensorflow/tests/order_by_dialect.mlir index bb3f74d702b261..f19de0b5996999 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/order_by_dialect.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/order_by_dialect.mlir @@ -85,12 +85,12 @@ func.func @tf_and_mhlo(%arg0: tensor<32x28x28x1xf32>, %arg1: tensor>>) -> tensor<3x3x1x5xf32> %5 = "tf.ReadVariableOp"(%arg3) : (tensor>>) -> tensor<3920x10xf32> %6 = mhlo.convolution(%arg0, %4) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<32x28x28x1xf32>, tensor<3x3x1x5xf32>) -> tensor<32x28x28x5xf32> - %7 = "mhlo.broadcast_in_dim"(%3) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<5xf32>) -> tensor<32x28x28x5xf32> + %7 = "mhlo.broadcast_in_dim"(%3) <{broadcast_dimensions = dense<3> 
: tensor<1xi64>}> : (tensor<5xf32>) -> tensor<32x28x28x5xf32> %8 = mhlo.add %6, %7 : tensor<32x28x28x5xf32> %9 = mhlo.maximum %8, %1 : tensor<32x28x28x5xf32> %10 = "mhlo.reshape"(%9) : (tensor<32x28x28x5xf32>) -> tensor<32x3920xf32> %11 = "mhlo.dot"(%10, %5) : (tensor<32x3920xf32>, tensor<3920x10xf32>) -> tensor<32x10xf32> - %12 = "mhlo.broadcast_in_dim"(%2) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<10xf32>) -> tensor<32x10xf32> + %12 = "mhlo.broadcast_in_dim"(%2) <{broadcast_dimensions = dense<1> : tensor<1xi64>}> : (tensor<10xf32>) -> tensor<32x10xf32> %13 = mhlo.add %11, %12 : tensor<32x10xf32> %14 = mhlo.maximum %13, %0 : tensor<32x10xf32> return %14 : tensor<32x10xf32> diff --git a/tensorflow/compiler/mlir/tensorflow/tests/xla_outline_entry_functions.mlir b/tensorflow/compiler/mlir/tensorflow/tests/xla_outline_entry_functions.mlir deleted file mode 100644 index 60f767a04cbf58..00000000000000 --- a/tensorflow/compiler/mlir/tensorflow/tests/xla_outline_entry_functions.mlir +++ /dev/null @@ -1,70 +0,0 @@ -// RUN: tf-opt %s -split-input-file -verify-diagnostics -tf-xla-outline-entry-functions | FileCheck %s - -// Check that we outline the top-level functions. 
- -// CHECK-LABEL: func.func private @main_outlined(%arg0: tensor) -> tensor attributes {_xla_compile_device_type = "CPU", allow_soft_placement = true} { -// CHECK: %0 = "tf.StatefulPartitionedCall"(%arg0) <{config = "", config_proto = "", executor_type = "", f = @f}> {_xla_compile_device_type = "CPU"} : (tensor) -> tensor -// CHECK: %cst = "tf.Const"() <{value = dense<5> : tensor}> : () -> tensor -// CHECK: %1 = "tf.Add"(%0, %cst) : (tensor, tensor) -> tensor -// CHECK: return %1 : tensor -// CHECK: } - -// CHECK: func.func @main(%arg0: tensor) -> tensor attributes {_xla_compile_device_type = "CPU", tf.entry_function = {}} { -// CHECK: %0 = "tf.StatefulPartitionedCall"(%arg0) <{config = "", config_proto = "", executor_type = "", f = @main_outlined}> {_xla_compile_device_type = "CPU", allow_soft_placement = true} : (tensor) -> tensor -// CHECK: return %0 : tensor -// CHECK: } -func.func @main(%arg0: tensor) -> tensor attributes {_xla_compile_device_type = "CPU", allow_soft_placement = true, tf.entry_function = {}} { - %0 = "tf.StatefulPartitionedCall"(%arg0) {config = "", config_proto = "", executor_type = "", _xla_compile_device_type = "CPU", f = @f} : (tensor) -> (tensor) - %1 = "tf.Const"() {value = dense<5> : tensor} : () -> tensor - %2 = "tf.Add"(%0, %1) : (tensor, tensor) -> (tensor) - func.return %2 : tensor -} - -func.func @f(%arg0: tensor) -> tensor { - func.return %arg0 : tensor -} - -// ----- - -// Tests multiple entry functions. 
- -// CHECK-LABEL: func.func private @entry1_outlined(%arg0: tensor) -> tensor attributes {_xla_compile_device_type = "CPU", allow_soft_placement = true} { -// CHECK: %0 = "tf.StatefulPartitionedCall"(%arg0) <{config = "", config_proto = "", executor_type = "", f = @f1}> {_xla_compile_device_type = "CPU"} : (tensor) -> tensor -// CHECK: %cst = "tf.Const"() <{value = dense<5> : tensor}> : () -> tensor -// CHECK: %1 = "tf.Add"(%0, %cst) : (tensor, tensor) -> tensor -// CHECK: return %1 : tensor -// CHECK: } - -// CHECK-LABEL: func.func private @entry2_outlined(%arg0: tensor) -> tensor attributes {_xla_compile_device_type = "CPU", allow_soft_placement = true} { -// CHECK: %0 = "tf.StatefulPartitionedCall"(%arg0) <{config = "", config_proto = "", executor_type = "", f = @f1}> {_xla_compile_device_type = "CPU"} : (tensor) -> tensor -// CHECK: %cst = "tf.Const"() <{value = dense<5> : tensor}> : () -> tensor -// CHECK: %1 = "tf.Add"(%0, %cst) : (tensor, tensor) -> tensor -// CHECK: return %1 : tensor -// CHECK: } - -// CHECK: func.func @entry1(%arg0: tensor) -> tensor attributes {_xla_compile_device_type = "CPU", tf.entry_function = {}} { -// CHECK: %0 = "tf.StatefulPartitionedCall"(%arg0) <{config = "", config_proto = "", executor_type = "", f = @entry1_outlined}> {_xla_compile_device_type = "CPU", allow_soft_placement = true} : (tensor) -> tensor -// CHECK: return %0 : tensor -// CHECK: } - -// CHECK: func.func @entry2(%arg0: tensor) -> tensor attributes {_xla_compile_device_type = "CPU", tf.entry_function = {}} { -// CHECK: %0 = "tf.StatefulPartitionedCall"(%arg0) <{config = "", config_proto = "", executor_type = "", f = @entry2_outlined}> {_xla_compile_device_type = "CPU", allow_soft_placement = true} : (tensor) -> tensor -// CHECK: return %0 : tensor -// CHECK: } -func.func @entry1(%arg0: tensor) -> tensor attributes {_xla_compile_device_type = "CPU", allow_soft_placement = true, tf.entry_function = {}} { - %0 = "tf.StatefulPartitionedCall"(%arg0) {config = "", 
config_proto = "", executor_type = "", _xla_compile_device_type = "CPU", f = @f1} : (tensor) -> (tensor) - %1 = "tf.Const"() {value = dense<5> : tensor} : () -> tensor - %2 = "tf.Add"(%0, %1) : (tensor, tensor) -> (tensor) - func.return %2 : tensor -} - -func.func @entry2(%arg0: tensor) -> tensor attributes {_xla_compile_device_type = "CPU", allow_soft_placement = true, tf.entry_function = {}} { - %0 = "tf.StatefulPartitionedCall"(%arg0) {config = "", config_proto = "", executor_type = "", _xla_compile_device_type = "CPU", f = @f1} : (tensor) -> (tensor) - %1 = "tf.Const"() {value = dense<5> : tensor} : () -> tensor - %2 = "tf.Add"(%0, %1) : (tensor, tensor) -> (tensor) - func.return %2 : tensor -} - -func.func @f1(%arg0: tensor) -> tensor { - func.return %arg0 : tensor -} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/xla_rewrite_v2.mlir b/tensorflow/compiler/mlir/tensorflow/tests/xla_rewrite_v2.mlir deleted file mode 100644 index e36bdaa72e41b8..00000000000000 --- a/tensorflow/compiler/mlir/tensorflow/tests/xla_rewrite_v2.mlir +++ /dev/null @@ -1,110 +0,0 @@ -// RUN: tf-opt %s -split-input-file -tf-xla-rewrite-v2 | FileCheck %s - - -module attributes {tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:GPU:0"]} { - // CHECK-LABEL: func.func @convert_cluster_func - func.func @convert_cluster_func(%arg0: tensor) -> tensor { - // CHECK: "tf_device.launch"() - // CHECK-SAME: <{device = "/job:localhost/replica:0/task:0/device:GPU:0"}> - // CHECK: "tf._XlaCompile"(%arg0) <{function = @func, must_compile = true, operandSegmentSizes = array}> : (tensor) -> (tensor<3x!tf_type.string>, tensor) - // CHECK: "tf_device.launch"() - // CHECK-SAME: <{device = "/job:localhost/replica:0/task:0/device:GPU:0"}> - // CHECK: "tf._XlaRun"(%arg0, %0#0) : (tensor, tensor<3x!tf_type.string>) -> tensor - %0 = "tf_device.cluster_func"(%arg0) {func = @func, device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor) -> tensor - 
func.return %0 : tensor - } - - func.func @func(%arg0: tensor) -> tensor { - func.return %arg0 : tensor - } -} - -// ----- - -module attributes {tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:GPU:0"]} { - // CHECK-LABEL: func.func @convert_cluster_func_with_resources_in_order - func.func @convert_cluster_func_with_resources_in_order(%arg0: tensor, %arg1: tensor) -> tensor { - // CHECK: "tf_device.launch"() - // CHECK-SAME: <{device = "/job:localhost/replica:0/task:0/device:GPU:0"}> - // CHECK: "tf._XlaCompile"(%arg1, %arg0) <{function = @func_with_resources_in_order, must_compile = true, operandSegmentSizes = array}> : (tensor, tensor) - // CHECK: "tf_device.launch"() - // CHECK-SAME: <{device = "/job:localhost/replica:0/task:0/device:GPU:0"}> - // CHECK: "tf._XlaRun"(%arg1, %arg0, %0#0) : (tensor, tensor, tensor<3x!tf_type.string>) -> tensor - %0 = "tf_device.cluster_func"(%arg1, %arg0) {func = @func_with_resources_in_order, device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor, tensor) -> (tensor) - func.return %0 : tensor - } - - func.func @func_with_resources_in_order(%arg0 : tensor, %arg1 : tensor) -> tensor { - func.return %arg0 : tensor - } -} - -// ----- - -module attributes {tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:GPU:0"]} { - // CHECK-LABEL: func.func @convert_cluster_func_with_resources - func.func @convert_cluster_func_with_resources(%arg0: tensor, %arg1: tensor) -> tensor { - // CHECK: "tf_device.launch"() - // CHECK-SAME: <{device = "/job:localhost/replica:0/task:0/device:GPU:0"}> - // CHECK: "tf._XlaCompile"(%arg1, %arg0) <{function = @func_with_resources_1, must_compile = true, operandSegmentSizes = array}> : (tensor, tensor) -> (tensor<3x!tf_type.string>, tensor) - // CHECK: "tf_device.launch"() - // CHECK-SAME: <{device = "/job:localhost/replica:0/task:0/device:GPU:0"}> - // CHECK: "tf._XlaRun"(%arg1, %arg0, %0#0) : (tensor, 
tensor, tensor<3x!tf_type.string>) -> tensor - %0 = "tf_device.cluster_func"(%arg0, %arg1) {func = @func_with_resources_1, device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor, tensor) -> tensor - // CHECK: "tf_device.launch"() - // CHECK-SAME: <{device = "/job:localhost/replica:0/task:0/device:GPU:0"}> - // CHECK: "tf._XlaCompile"(%arg1, %arg0) <{function = @func_with_resources_2, must_compile = true, operandSegmentSizes = array}> : (tensor, tensor) -> (tensor<3x!tf_type.string>, tensor) - // CHECK: "tf_device.launch"() - // CHECK-SAME: <{device = "/job:localhost/replica:0/task:0/device:GPU:0"}> - // CHECK: "tf._XlaRun"(%arg1, %arg0, %2#0) : (tensor, tensor, tensor<3x!tf_type.string>) -> tensor - %1 = "tf_device.cluster_func"(%arg0, %arg1) {func = @func_with_resources_2, device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor, tensor) -> tensor - return %0 : tensor - } - - - func.func @func_with_resources_1(%arg0 : tensor, %arg1: tensor) -> tensor { - func.return %arg1 : tensor - } - - func.func @func_with_resources_2(%arg0 : tensor, %arg1: tensor) -> tensor { - func.return %arg1 : tensor - } -} - -// ----- - -// CHECK-LABEL: func.func @outside_compilation_in_generic_pipeline -module attributes {tf.devices = ["/job:localhost/replica:0/task:0/device:CPU:0"], tf.versions = {producer = 888 : i32}} { - func.func @outside_compilation_in_generic_pipeline(%arg0: tensor<2xi32>) -> tensor<2xi32> { - // CHECK: tf_device.launch - // CHECK-SAME: <{device = "/job:localhost/replica:0/task:0/device:GPU:0"}> - // CHECK: "tf._XlaCompile"() <{function = @func, must_compile = true, operandSegmentSizes = array}> - // CHECK: tf_device.parallel_execute - // CHECK: tf_device.launch - // CHECK-SAME: <{device = "/job:localhost/replica:0/task:0/device:CPU:0"}> - // CHECK: tf.B - // CHECK: tf._XlaSendFromHost - // CHECK: tf_device.launch - // CHECK-SAME: <{device = "/job:localhost/replica:0/task:0/device:GPU:0"}> - // CHECK: tf._XlaRun - %0 = 
"tf_device.parallel_execute"() ({ - "tf_device.launch"() ({ - %1 = "tf._XlaCompileMlirPlaceholderProgramKey"() : () -> tensor<3x!tf_type.string> - %2 = "tf.B"() : () -> tensor<2xi32> - "tf._XlaSendFromHost"(%2, %1) {_xla_has_host_transfer = true, device_ordinal = 0 : i64, key = "host_compute_channel_0_retvals"} : (tensor<2xi32>, tensor<3x!tf_type.string>) -> () - tf_device.return - }) {device = "/job:localhost/replica:0/task:0/device:CPU:0"} : () -> () - tf_device.return - }, { - %0 = "tf_device.cluster_func"() {func = @func, device = "/job:localhost/replica:0/task:0/device:GPU:0"} : () -> tensor<2xi32> - tf_device.return %0 : tensor<2xi32> - }) : () -> tensor<2xi32> - return %0 : tensor<2xi32> - } - func.func @func() -> tensor<2xi32> { - %2 = "tf.A"() : () -> tensor<2xi32> - %3 = "tf._XlaHostComputeMlir"() {host_mlir_module = "", manual_sharding = false, recv_key = "host_compute_channel_0_retvals", send_key = "host_compute_channel_0_args"} : () -> tensor<2xi32> - %4 = "tf.C"(%3) : (tensor<2xi32>) -> tensor<2xi32> - func.return %4 : tensor<2xi32> - } -} diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/BUILD b/tensorflow/compiler/mlir/tensorflow/transforms/BUILD index 3d1cf1bd58fa38..2e090224a5c86c 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/BUILD +++ b/tensorflow/compiler/mlir/tensorflow/transforms/BUILD @@ -541,7 +541,6 @@ cc_library( "xla_call_module_serialization.cc", "xla_inline_device_ops.cc", "xla_rewrite.cc", - "xla_rewrite_v2.cc", "xla_validate_inputs.cc", ], hdrs = [ diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/constant_fold.cc b/tensorflow/compiler/mlir/tensorflow/transforms/constant_fold.cc index 84220aa346bf50..880dfa837e881c 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/constant_fold.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/constant_fold.cc @@ -122,10 +122,6 @@ LogicalResult ConstantFoldFallbackHook( inputs.push_back(input.cast()); } - // Avoid overlapping folds with the same 
context. - // TODO(jpienaar): Avoid using global context & mutex here. - static auto* mu = new tensorflow::mutex(); - tensorflow::mutex_lock l(*mu); SmallVector constants; LogicalResult status = EvaluateOperation(inst, inputs, constants); results.assign(constants.begin(), constants.end()); diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/host_runtime/lower_cluster_to_runtime_ops.cc b/tensorflow/compiler/mlir/tensorflow/transforms/host_runtime/lower_cluster_to_runtime_ops.cc index a239c7304a0ae0..c61b1e0c14a852 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/host_runtime/lower_cluster_to_runtime_ops.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/host_runtime/lower_cluster_to_runtime_ops.cc @@ -94,12 +94,7 @@ void AddTPULowerClusterToRuntimeOpsPassPipeline(OpPassManager& pm, void AddNonTPULowerClusterToRuntimeOpsPassPipeline( OpPassManager& pm, llvm::StringRef module_name) { // Rewrite cluster functions into XLA launch ops. - if (tensorflow::GetMlirCommonFlags() - ->tf_mlir_enable_generic_outside_compilation) { - pm.addPass(mlir::TFDevice::CreateXlaRewriteV2Pass()); - } else { - pm.addPass(mlir::TFDevice::CreateXlaRewritePass()); - } + pm.addPass(mlir::TFDevice::CreateXlaRewritePass()); // Re-run the canonicalizer pass as some cleanup during resource op lifting // pass opens up some opportunities for canonicalization of cluster ops. // Specifically, we want to eliminate pass through results from the cluster diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/passes.h b/tensorflow/compiler/mlir/tensorflow/transforms/passes.h index da89e77cb0862c..3a2ba6f181f649 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/passes.h +++ b/tensorflow/compiler/mlir/tensorflow/transforms/passes.h @@ -483,10 +483,6 @@ std::unique_ptr> CreateXlaInlineDeviceOpsPass(); // type` with `tf.XlaLaunch` ops. 
std::unique_ptr> CreateXlaRewritePass(); -// Creates a pass that rewrites partitioned calls with `tf._XlaCompile` op and -// `tf.XlaRun` op. -std::unique_ptr> CreateXlaRewriteV2Pass(); - // Create a pass that validates the input graph to the CPU/GPU bridge. std::unique_ptr> CreateXlaValidateInputsPass(); } // namespace TFDevice diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/sparsecore/embedding_pipelining.cc b/tensorflow/compiler/mlir/tensorflow/transforms/sparsecore/embedding_pipelining.cc index 0c450126e4e090..e565d50660558c 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/sparsecore/embedding_pipelining.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/sparsecore/embedding_pipelining.cc @@ -193,14 +193,15 @@ struct EmbeddingPipeliningPass bool UseEmbeddingPipelining(ModuleOp& module) { // Enable automated pipelining pass unless: - // 1. The user disables it via flog, or + // 1. The user disables it via flag, or // 2. The graph contains TF.Summary ops. Graphs like this typically only run // for a single step which doesn't work in pipelining. if (tensorflow::GetBuildXlaOpsPassFlags() - ->tf_xla_disable_full_embedding_pipelining) + ->tf_xla_disable_full_embedding_pipelining) { + LOG(INFO) << "Embedding pipelining disabled via flag."; return false; - + } // Detect summaries by looking for key Ops in the graph. It would be better to // do this via operator attributes rather than looking for a specific op. 
WalkResult walk_result = module.walk([&](Operation* op) -> WalkResult { @@ -208,10 +209,10 @@ bool UseEmbeddingPipelining(ModuleOp& module) { return WalkResult::advance(); }); if (walk_result.wasInterrupted()) { - VLOG(1) << "TF summaries detected - disabling embedding pipelining."; + LOG(INFO) << "TF summaries detected - disabling embedding pipelining."; return false; } - VLOG(1) << "Embedding pipelining rewrite enabled."; + LOG(INFO) << "Embedding pipelining rewrite enabled."; return true; } @@ -1685,12 +1686,11 @@ Operation* LiftNonTpuFuncCaller(mlir::OpBuilder& builder, } void EmbeddingPipeliningPass::runOnOperation() { - VLOG(3) << "EmbeddingPipeliningPass::runOnOperation()"; + LOG(INFO) << "EmbeddingPipeliningPass::runOnOperation()"; ModuleOp module = getOperation(); // We only use one of the EmbeddingPipelining and EmbeddingSequencing passes. if (!UseEmbeddingPipelining(module)) return; - VLOG(1) << "Embedding pipelining rewrite enabled."; SymbolTable symbol_table(module); @@ -1722,7 +1722,7 @@ void EmbeddingPipeliningPass::runOnOperation() { // If there are no forward pass ops, there is no SC, so we end early. if (forward_pass_ops.empty()) { if (backward_pass_ops.empty()) { - VLOG(1) << "no pipelining ops found"; + LOG(INFO) << "no pipelining ops found"; return; } else { (*backward_pass_ops.begin())->emitOpError() @@ -1812,11 +1812,11 @@ void EmbeddingPipeliningPass::runOnOperation() { if (failed(result)) return signalPassFailure(); merged_set.insert(non_tpu_ops.begin(), non_tpu_ops.end()); - VLOG(3) << "Forwards pass " << forward_pass_ops.size() - << " ops, backwards pass " << backward_pass_ops.size() - << " ops, core " << core_tpu_ops.size() - << " ops. Total = " << merged_set.size() << " of " - << GetNumOps(loop_body_func); + LOG(INFO) << "Forwards pass " << forward_pass_ops.size() + << " ops, backwards pass " << backward_pass_ops.size() + << " ops, core " << core_tpu_ops.size() + << " ops. 
Total = " << merged_set.size() << " of " + << GetNumOps(loop_body_func); builder.setInsertionPointAfter(*non_tpu_ops.begin()); TF::StatefulPartitionedCallOp non_tpu_caller = nullptr; @@ -2185,7 +2185,8 @@ void EmbeddingPipeliningPass::runOnOperation() { int parallel_iterations = parallel_iterations_flag > 0 ? parallel_iterations_flag : orig_while_op.getParallelIterations(); - VLOG(1) << "Setting parallel_iterations_flag to " << parallel_iterations_flag; + LOG(INFO) << "Setting parallel_iterations_flag to " + << parallel_iterations_flag; auto new_while_op = builder.create( orig_while_op->getLoc(), new_body_return_types, new_while_operands.getArrayRef(), cond.getSymName(), body.getSymName(), @@ -2252,7 +2253,7 @@ void EmbeddingPipeliningPass::runOnOperation() { orig_while_op.body_function().erase(); orig_while_op.erase(); - VLOG(3) << "EmbeddingPipeliningPass::runOnOperation done."; + LOG(INFO) << "EmbeddingPipeliningPass::runOnOperation done."; } } // namespace diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/sparsecore/embedding_sequencing.cc b/tensorflow/compiler/mlir/tensorflow/transforms/sparsecore/embedding_sequencing.cc index 7ed29a3ed58cc3..577b374a43847d 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/sparsecore/embedding_sequencing.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/sparsecore/embedding_sequencing.cc @@ -413,8 +413,7 @@ LogicalResult FindForwardPassOps(OpBuilder& builder, if (is_non_variable && is_variable) { loop_body_func.emitOpError() << "resource input " << argument.getArgNumber() - << " is used both as a varible and not " - << " a variable"; + << " is used both as a varible and not a variable"; return LogicalResult::failure(); } if (is_variable && use_in_forward) @@ -772,7 +771,7 @@ LogicalResult ExtractOpsAsFunc( } void EmbeddingSequencingPass::runOnOperation() { - VLOG(3) << "EmbeddingSequencingPass::runOnOperation()"; + LOG(INFO) << "EmbeddingSequencingPass::runOnOperation()"; ModuleOp module = 
getOperation(); llvm::SetVector forward_pass_ops; @@ -803,6 +802,8 @@ void EmbeddingSequencingPass::runOnOperation() { // If there are no forward pass ops, there is no SC, so we end early. if (forward_pass_ops.empty()) { if (backward_pass_ops.empty()) { + LOG(INFO) << "No unprocessed embedding ops found - skipping embedding " + << "sequencing rewrite."; return; } else { (*backward_pass_ops.begin())->emitOpError() @@ -810,7 +811,7 @@ void EmbeddingSequencingPass::runOnOperation() { return signalPassFailure(); } } - VLOG(1) << "Embedding sequencing rewrite enabled."; + LOG(INFO) << "Embedding sequencing rewrite enabled."; // Ensure that all ops are in the same region, and have the same replication // info. @@ -860,18 +861,17 @@ void EmbeddingSequencingPass::runOnOperation() { TF::WhileOp while_op = nullptr; result = FindOwningWhileOp(loop_body_func, module, &while_op); if (failed(result)) { - VLOG(1) << "WhileOp not found: assuming external loop."; + LOG(INFO) << "WhileOp not found: assuming external loop."; } else { // Override the WhileOp parallel_iterations if requested by flag. 
int parallel_iterations_flag = tensorflow::GetBuildXlaOpsPassFlags() ->tf_xla_embedding_parallel_iterations; if (parallel_iterations_flag > 0) { - VLOG(1) << "Setting WhileOp parallel_iterations_flag to " - << parallel_iterations_flag; + LOG(INFO) << "Setting WhileOp parallel_iterations_flag to " + << parallel_iterations_flag; while_op.setParallelIterations(parallel_iterations_flag); } else { - VLOG(1) << "Using original WhileOp parallel_iterations = " - << while_op.getParallelIterations(); + LOG(INFO) << "Using original WhileOp parallel_iteration"; } } @@ -898,11 +898,11 @@ void EmbeddingSequencingPass::runOnOperation() { if (failed(result)) return signalPassFailure(); merged_set.insert(non_tpu_ops.begin(), non_tpu_ops.end()); - VLOG(2) << "Forwards pass " << forward_pass_ops.size() - << " ops, backwards pass " << backward_pass_ops.size() - << " ops, core " << core_tpu_ops.size() - << " ops. Total = " << merged_set.size() << " of " - << GetNumOps(loop_body_func) << ".\n"; + LOG(INFO) << "Forwards pass " << forward_pass_ops.size() + << " ops, backwards pass " << backward_pass_ops.size() + << " ops, core " << core_tpu_ops.size() + << " ops. 
Total = " << merged_set.size() << " of " + << GetNumOps(loop_body_func) << ".\n"; builder.setInsertionPointAfter(*non_tpu_ops.begin()); Operation* non_tpu_caller = nullptr; @@ -936,7 +936,7 @@ void EmbeddingSequencingPass::runOnOperation() { metadata_op->erase(); compilation_op->erase(); - VLOG(3) << "EmbeddingSequencingPass::runOnOperation done."; + LOG(INFO) << "EmbeddingSequencingPass::runOnOperation done."; } } // namespace diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tf_device_passes.td b/tensorflow/compiler/mlir/tensorflow/transforms/tf_device_passes.td index c89c909375df67..169f5f206dabc5 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tf_device_passes.td +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tf_device_passes.td @@ -330,54 +330,3 @@ def XlaValidateInputsPass : Pass<"tf-xla-validate-inputs", "ModuleOp"> { let constructor = "TFDevice::CreateXlaValidateInputsPass()"; let dependentDialects = ["tf_device::TensorFlowDeviceDialect"]; } - -def XlaRewriteV2Pass : Pass<"tf-xla-rewrite-v2", "mlir::ModuleOp"> { - let summary = "Rewrites `tf_device.cluster_func op` into `_XlaCompile` and `_XlaRun` ops to make the attached function run on XLA."; - - let description = [{ - This pass rewrites `tf_device.cluster_func` op into - `tf._XlaCompile` op and `tf._XlaRun` op. This makes the attached - function execute with XLA. `tf.XlaCompile` requires resource-type arguments - come at the end, so this pass rewrites the called function if necessary. - This pass assumes there are no nested `tf_device.cluster`s so we don't end - up creating nested `tf._XlaCompile` and `tf._XlaRun` ops. 
- - For example, the `tf_device.cluster_func` operation in the following code - - ```mlir - func.func @convert_cluster_func_with_resources(%arg0: tensor, %arg1: tensor) -> tensor { - %0 = "tf_device.cluster_func"(%arg0, %arg1) {device = "/job:localhost/replica:0/task:0/device:GPU:0", func = @func_with_resources} : (tensor, tensor) -> tensor - return %0 : tensor - } - - func.func @func_with_resources(%arg0: tensor, %arg1: tensor) -> tensor { - return %arg1 : tensor - } - ``` - - will be replaced by a `tf._XlaCompile` and `tf._XlaRun` operation. - - ```mlir - func.func @convert_cluster_func_with_resources(%arg0: tensor, %arg1: tensor) -> tensor { - %0:2 = "tf_device.launch"() ({ - %key, %compilation_successful = "tf._XlaCompile"(%arg1, %arg0) {function = @func_with_resources, must_compile = true, operand_segment_sizes = array} : (tensor, tensor) -> (tensor<3x!tf_type.string>, tensor) - tf_device.return %key, %compilation_successful : tensor<3x!tf_type.string>, tensor - }) {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : () -> (tensor<3x!tf_type.string>, tensor) - %1 = "tf_device.launch"() ({ - %2 = "tf._XlaRun"(%arg1, %arg0, %0#0) : (tensor, tensor, tensor<3x!tf_type.string>) -> tensor - tf_device.return %2 : tensor - }) {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : () -> tensor - return %1 : tensor - } - - func.func @func_with_resources(%arg0: tensor, %arg1: tensor) -> tensor { - return %arg0 : tensor - } - ``` - Notice that the called function is rewritten, with the order of its parameters changed. 
- }]; - - let constructor = "TFDevice::CreateXlaRewriteV2Pass()"; - let dependentDialects = ["tf_device::TensorFlowDeviceDialect"]; -} - diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/xla_rewrite_v2.cc b/tensorflow/compiler/mlir/tensorflow/transforms/xla_rewrite_v2.cc deleted file mode 100644 index f8752e316233dd..00000000000000 --- a/tensorflow/compiler/mlir/tensorflow/transforms/xla_rewrite_v2.cc +++ /dev/null @@ -1,397 +0,0 @@ -/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -// This transformation pass converts tf_device.cluster_func op into -// tf._XlaCompile and tf._XlaRun ops. 
- -#include -#include - -#include "mlir/Support/LogicalResult.h" // from @llvm-project -#include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" -#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" -#include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" -#include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" -#include "tensorflow/compiler/mlir/tensorflow/utils/parallel_execute_util.h" -#include "tensorflow/compiler/mlir/tensorflow/utils/xla_rewrite_util.h" - -#define DEBUG_TYPE "tf-xla-rewrite-v2" - -namespace mlir { -namespace { - -#define GEN_PASS_DEF_XLAREWRITEV2PASS -#include "tensorflow/compiler/mlir/tensorflow/transforms/tf_device_passes.h.inc" - -constexpr absl::string_view kDeviceAttr = "device"; - -struct XlaRewriteV2Pass : public impl::XlaRewriteV2PassBase { - void runOnOperation() override; -}; - -// Get the device from `tf_device.cluster_func` op -mlir::LogicalResult GetClusterFuncDevice(tf_device::ClusterFuncOp cluster_func, - std::string& compilation_device) { - auto device_attr = cluster_func->getAttrOfType(kDeviceAttr); - if (device_attr) { - compilation_device = device_attr.str(); - } else { - return cluster_func.emitOpError("No device assigned for cluster_func "); - } - return success(); -} - -// Rearrange the input order by putting resource args after non resource args -// Returns true when the inputs is in order, otherwise return false -bool RearrangeInputOrder(llvm::SmallVector inputs, - llvm::SmallVector& non_resource_args, - llvm::SmallVector& resource_args) { - bool has_resources = false; - bool in_order = true; - for (const Value& arg : inputs) { - if (!getElementTypeOrSelf(arg.getType()).template isa()) { - non_resource_args.push_back(arg); - if (has_resources) in_order = false; - } else { - resource_args.push_back(arg); - has_resources = true; - } - } - return in_order; -} - -// Move the resource args to the end of the function operand list. 
-void MoveResourceArgsToEnd(func::FuncOp callee) { - llvm::DenseMap mapping; - unsigned num_params = callee.getNumArguments(); - llvm::BitVector removed_params(num_params); - // Copy the resource-type parameters to the end. - for (unsigned i = 0; i < num_params; ++i) { - BlockArgument param = callee.getArgument(i); - if (getElementTypeOrSelf(param.getType()) - .template isa()) { - removed_params.set(i); - callee.getBody().addArgument(param.getType(), param.getLoc()); - param.replaceAllUsesWith(callee.getArguments().back()); - removed_params.push_back(false); - } - } - // Remove old resource-type parameters. - callee.getBody().front().eraseArguments(removed_params); - // Update function type. - callee.setFunctionType(FunctionType::get(callee.getContext(), - callee.getBody().getArgumentTypes(), - callee.getResultTypes())); -} - -mlir::LogicalResult GetOutputTypesForClusterFunc( - mlir::tf_device::ClusterFuncOp cluster_func, - llvm::SmallVectorImpl* output_types) { - output_types->reserve(cluster_func.getNumResults()); - for (const auto& result_and_index : - llvm::enumerate(cluster_func.getResults())) { - const auto cluster_func_output_type = - result_and_index.value().getType().cast(); - output_types->emplace_back(cluster_func_output_type); - } - return mlir::success(); -} - -mlir::LogicalResult ExtractInputsForLogicalDevices( - const int num_cores_per_replica, - mlir::tf_device::ClusterFuncOp cluster_func, mlir::OpBuilder* builder, - llvm::SmallVectorImpl>* input_list) { - // Initialize the input list for each logical devices. - input_list->reserve(num_cores_per_replica); - for (int i = 0; i < num_cores_per_replica; ++i) - input_list->emplace_back(llvm::SmallVector()); - - llvm::SmallVector cluster_func_inputs( - cluster_func.getOperands()); - - // If sharding attribute does not exist, then all inputs are placed on 0th - // logical core by default. 
- (*input_list)[0] = cluster_func_inputs; - return mlir::success(); -} - -// Creates a `tf._XlaRun` op that executes XLA program. -LogicalResult BuildExecuteOp(llvm::SmallVector input, - tf_device::ClusterFuncOp cluster_func, - Operation* compile_op, int core, - OpBuilder* builder, TF::_XlaRunOp* execute_op) { - llvm::SmallVector output_types; - llvm::SmallVector cluster_to_core_index; - auto result = GetOutputTypesForClusterFunc(cluster_func, &output_types); - if (failed(result)) return failure(); - - llvm::SmallVector non_resource_args, resource_args; - bool in_order = RearrangeInputOrder(input, non_resource_args, resource_args); - - llvm::SmallVector execute_inputs; - if (!in_order) { - for (auto non_resource_arg : non_resource_args) { - execute_inputs.emplace_back(non_resource_arg); - } - for (auto resource_arg : resource_args) { - execute_inputs.emplace_back(resource_arg); - } - } else { - execute_inputs = input; - } - execute_inputs.emplace_back(compile_op->getResult(core)); - - // _XlaRun op has same output types as cluster_func. - *execute_op = builder->create(cluster_func.getLoc(), - output_types, execute_inputs); - return success(); -} - -// parallel_execute op returns concatenated list of return values of all its -// regions. -mlir::LogicalResult GetConcatenatedOutputTypes( - const int num_cores_per_replica, tf_device::ClusterFuncOp cluster_func, - tf_device::ParallelExecuteOp old_parallel_execute, - const ValueTypeRange& cluster_result_types, - llvm::SmallVector& concatenated_output_types) { - // parallel_execute op returns concatenated list of return values of - // all its regions. 
- concatenated_output_types.reserve(cluster_result_types.size() * - num_cores_per_replica); - for (mlir::Region& region : old_parallel_execute.getRegions()) { - if (!isa(region.front().front())) { - for (Type t : region.front().front().getResultTypes()) - concatenated_output_types.emplace_back(t); - } - } - - for (int core = 0; core < num_cores_per_replica; ++core) { - llvm::SmallVector output_types; - auto result = GetOutputTypesForClusterFunc(cluster_func, &output_types); - if (failed(result)) return failure(); - for (Type t : output_types) { - concatenated_output_types.emplace_back(t); - } - } - return success(); -} - -// Given a `ParallelExecute`, replace it with a new `ParallelExecute`. The -// new `ParallelExecute` will replace the child that contains the -// `ClusterFunc` with `num_cores_per_replica` children. It keep other children -// the same. Return values from the child with the `ClusterFunc` will be -// duplicated `num_cores_per_replica` times. -LogicalResult AddToParallelExecuteOp( - llvm::SmallVectorImpl>* cluster_to_core_index, - Operation* compile_op, tf_device::ClusterFuncOp cluster_func, - OpBuilder* builder, tf_device::ParallelExecuteOp old_parallel_execute, - tf_device::ParallelExecuteOp* new_parallel_execute, int* cluster_idx) { - const int num_cores_per_replica = 1; - const auto cluster_result_types = cluster_func.getResultTypes(); - llvm::SmallVector concatenated_output_types; - - if (failed(GetConcatenatedOutputTypes( - num_cores_per_replica, cluster_func, old_parallel_execute, - cluster_result_types, concatenated_output_types))) - return failure(); - - *cluster_idx = tensorflow::MovePreservedParallelExecuteChildren( - num_cores_per_replica, concatenated_output_types, builder, cluster_func, - old_parallel_execute, new_parallel_execute); - - // Extract inputs for each block of the parallel_execute op. The i-th - // element in the list represents the input lists to XLA computation for - // i-th logical core. 
- llvm::SmallVector, 4> input_list; - builder->setInsertionPoint(*new_parallel_execute); - auto result = ExtractInputsForLogicalDevices( - num_cores_per_replica, cluster_func, builder, &input_list); - if (failed(result)) return failure(); - - // For each logical core, create a region with tf._XlaRun op. - for (int core = 0; core < num_cores_per_replica; ++core) { - auto& block = - new_parallel_execute->GetRegionBlockWithIndex((*cluster_idx) + core); - builder->setInsertionPointToEnd(&block); - - // Create Execute op _XlaRun. - TF::_XlaRunOp execute; - if (failed(BuildExecuteOp(input_list[core], cluster_func, compile_op, core, - builder, &execute))) - return failure(); - - std::string execute_device; - if (failed(GetClusterFuncDevice(cluster_func, execute_device))) - return failure(); - - auto block_launch_op = tensorflow::WrapOpInLaunch( - builder, block.getParent()->getLoc(), execute, execute_device); - - builder->create(block.getParent()->getLoc(), - block_launch_op.getResults()); - } - - return success(); -} - -// Replace the uses of old parallel execute outputs with new outputs -mlir::LogicalResult RemapOutputsFromLogicalDevices( - mlir::tf_device::ParallelExecuteOp old_parallel_execute, int cluster_idx, - mlir::tf_device::ParallelExecuteOp new_parallel_execute, - mlir::OpBuilder* builder) { - for (auto [output_index, old_parallel_execute_output] : - llvm::enumerate(old_parallel_execute.getResults())) { - const auto output_from_logical_device = - new_parallel_execute.GetRegionOutputs(cluster_idx)[output_index]; - old_parallel_execute_output.replaceAllUsesWith(output_from_logical_device); - } - return mlir::success(); -} - -// Create a `tf._XlaCompile` op -Operation* BuildCompileOp(tf_device::ClusterFuncOp cluster_func, - llvm::StringRef compilation_device, - SymbolTable& symtab, OpBuilder* builder) { - llvm::SmallVector non_resource_args, resource_args; - bool in_order = RearrangeInputOrder(cluster_func.getOperands(), - non_resource_args, resource_args); - if 
(!in_order) { - // Functions do not get reused in practice, so skip the check for if the - // callee has been updated. - StringAttr callee_sym = cluster_func.getFuncAttr().getAttr(); - MoveResourceArgsToEnd(symtab.lookup(callee_sym)); - } - - auto program_type = - RankedTensorType::get({3}, builder->getType()); - auto compilation_status_type = - RankedTensorType::get({}, builder->getType()); - auto compile_op = builder->create( - cluster_func.getLoc(), program_type, compilation_status_type, - /*constants=*/ValueRange({}), ValueRange(non_resource_args), - ValueRange(resource_args), builder->getBoolAttr(true), - cluster_func.getFuncAttr()); - return tensorflow::WrapOpInLaunch(builder, compile_op.getLoc(), compile_op, - compilation_device); -} - -mlir::LogicalResult GetCompilationDeviceFromParallelExecuteOp( - tf_device::ParallelExecuteOp& old_parallel_execute, - std::string& compilation_device) { - auto& first_block = old_parallel_execute.GetRegionBlockWithIndex(0); - if (isa(first_block.front())) { - auto device_attr = - first_block.front().getAttrOfType(kDeviceAttr); - if (device_attr) { - compilation_device = device_attr.str(); - } else { - return failure(); - } - } - return success(); -} - -mlir::LogicalResult Rewrite(tf_device::ClusterFuncOp cluster_func, - SymbolTable& symtab, OpBuilder& builder) { - // Fetch the ParallelExecute parent of `cluster_func`, or create it if - // it does not exist. 
- tf_device::ParallelExecuteOp old_parallel_execute = - cluster_func->getParentOfType(); - if (old_parallel_execute && - cluster_func->getParentOp() != old_parallel_execute) { - cluster_func->emitError() << "The ParallelExecute ancestor of a " - "ClusterFunc must be its direct parent."; - } - - // Fetch compilation device - std::string compilation_device; - if (failed(GetClusterFuncDevice(cluster_func, compilation_device))) - return failure(); - - if (!old_parallel_execute) { - old_parallel_execute = - mlir::TF::BuildParallelExecuteOp(cluster_func, &builder); - } - - // Build compile op _XlaCompile - builder.setInsertionPoint(old_parallel_execute); - Operation* compile_op = - BuildCompileOp(cluster_func, compilation_device, symtab, &builder); - if (!compile_op) { - return failure(); - } - - old_parallel_execute.walk( - [&](TF::_XlaCompileMlirPlaceholderProgramKeyOp key_op) { - key_op.replaceAllUsesWith(compile_op->getResult(0)); - key_op.erase(); - }); - - // Build new parallel execute op - tf_device::ParallelExecuteOp new_parallel_execute; - int num_cores_per_replica = 1; - int cluster_idx; - llvm::SmallVector, 4> cluster_to_core_index; - cluster_to_core_index.reserve(num_cores_per_replica); - - if (failed(AddToParallelExecuteOp( - &cluster_to_core_index, compile_op, cluster_func, &builder, - old_parallel_execute, &new_parallel_execute, &cluster_idx))) - return failure(); - - // As tf_device.parallel_execute wraps # logical cores number of tf._XlaRun - // ops, the number of return values of parallel_execute op may exceed that of - // cluster_func op. As such, each return value of parallel_execute op must - // be mapped with corresponding return value usages of cluster_func. 
- if (failed(RemapOutputsFromLogicalDevices(old_parallel_execute, cluster_idx, - new_parallel_execute, &builder))) - return failure(); - - if (failed(mlir::TF::RemoveSingletonParallelExecuteOp(new_parallel_execute, - &builder))) - return failure(); - - return success(); -} - -void XlaRewriteV2Pass::runOnOperation() { - ModuleOp module = getOperation(); - SymbolTable symtab(module); - OpBuilder builder(&getContext()); - llvm::SmallVector cluster_func_ops; - module.walk([&](tf_device::ClusterFuncOp cluster_func) { - cluster_func_ops.push_back(cluster_func); - }); - - for (tf_device::ClusterFuncOp cluster_func : cluster_func_ops) { - if (failed(Rewrite(cluster_func, symtab, builder))) - return signalPassFailure(); - } - - // Erase all the tf_device.cluster_func ops - if (failed(tensorflow::EraseClusterFuncs(cluster_func_ops))) { - return signalPassFailure(); - } -} - -} // namespace - -namespace TFDevice { -std::unique_ptr> CreateXlaRewriteV2Pass() { - return std::make_unique(); -} - -} // namespace TFDevice -} // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc b/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc index 95a637dbfbb3b3..42b059cbd0a527 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc +++ b/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc @@ -4280,17 +4280,13 @@ SavedModelMLIRImportInput::~SavedModelMLIRImportInput() {} absl::StatusOr> ConvertGraphdefToMlir( const GraphDef& graphdef, const GraphDebugInfo& debug_info, - const GraphImportConfig& specs, mlir::MLIRContext* context, - bool add_default_attributes) { + const GraphImportConfig& specs, mlir::MLIRContext* context) { GraphConstructorOptions options; options.allow_internal_ops = true; - options.add_default_attributes = add_default_attributes; Graph graph(OpRegistry::Global()); - GraphDef preprocessed_graphdef(graphdef); - if (add_default_attributes) { - TF_RETURN_IF_ERROR(PreprocessGraphDef(&specs, 
&preprocessed_graphdef)); - } + TF_RETURN_IF_ERROR(PreprocessGraphDef(&specs, &preprocessed_graphdef)); + if (specs.upgrade_legacy) { TF_RETURN_IF_ERROR(GenerateResourceSharedNameIfEmpty( preprocessed_graphdef, graph.flib_def().default_registry())); diff --git a/tensorflow/compiler/mlir/tensorflow/translate/import_model.h b/tensorflow/compiler/mlir/tensorflow/translate/import_model.h index 182a53078ba215..1670fd11a1f819 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/import_model.h +++ b/tensorflow/compiler/mlir/tensorflow/translate/import_model.h @@ -41,8 +41,7 @@ inline constexpr absl::string_view kImportModelDefaultGraphFuncName = "main"; // tf_executor dialect. tsl::StatusOr> ConvertGraphdefToMlir( const GraphDef& graphdef, const GraphDebugInfo& debug_info, - const GraphImportConfig& specs, mlir::MLIRContext* context, - bool add_default_attributes = true); + const GraphImportConfig& specs, mlir::MLIRContext* context); // Given a Graph, returns a MLIR module containing the graph, expressed with // tf_executor dialect. 
diff --git a/tensorflow/compiler/mlir/tf2xla/api/v1/cluster_tf_test.cc b/tensorflow/compiler/mlir/tf2xla/api/v1/cluster_tf_test.cc index e674989d2174ba..2dda3809fc2b9b 100644 --- a/tensorflow/compiler/mlir/tf2xla/api/v1/cluster_tf_test.cc +++ b/tensorflow/compiler/mlir/tf2xla/api/v1/cluster_tf_test.cc @@ -60,16 +60,16 @@ class SessionClusterTensorflowDialectTest : public ::testing::Test { context_.loadAllAvailableDialects(); } - tsl::Status CreateMlirModule(std::string mlir_module_filename) { + absl::Status CreateMlirModule(std::string mlir_module_filename) { std::string mlir_module_path = TestDataPath() + mlir_module_filename; mlir_module_ = mlir::parseSourceFile(mlir_module_path, &context_); if (!mlir_module_) { - return tsl::Status( + return absl::Status( absl::StatusCode::kNotFound, absl::StrCat("Could not find MLIR module at ", mlir_module_path)); } - return tsl::OkStatus(); + return absl::OkStatus(); } DialectRegistry registry_; diff --git a/tensorflow/compiler/mlir/tf2xla/api/v1/compile_mlir_util.cc b/tensorflow/compiler/mlir/tf2xla/api/v1/compile_mlir_util.cc index 59fb22e87eab58..322862828e63b3 100644 --- a/tensorflow/compiler/mlir/tf2xla/api/v1/compile_mlir_util.cc +++ b/tensorflow/compiler/mlir/tf2xla/api/v1/compile_mlir_util.cc @@ -110,7 +110,7 @@ Status MaybeRewriteLayoutWithShardedShape( mlir::StringAttr sharding, const XlaShapeLayoutHelpers::ShapeDeterminationFns shape_determination_fns, xla::Shape* shape) { - if (!sharding) return OkStatus(); + if (!sharding) return absl::OkStatus(); xla::OpSharding op_sharding; if (tensorflow::DecodeShardingAttribute(sharding, op_sharding).failed()) { @@ -121,7 +121,7 @@ Status MaybeRewriteLayoutWithShardedShape( TF_ASSIGN_OR_RETURN(hlo_sharding, xla::HloSharding::FromProto(op_sharding)); TF_RETURN_IF_ERROR(RewriteLayoutWithShardedShape( hlo_sharding, /*use_fast_memory=*/false, shape_determination_fns, shape)); - return OkStatus(); + return absl::OkStatus(); } // Converts arg_shapes to xla::Shape's and store into 
xla_input_shapes. @@ -168,7 +168,7 @@ Status GetXlaInputShapes( } else { *xla_input_shapes = individual_arg_shapes; } - return OkStatus(); + return absl::OkStatus(); } // Returns a static ranked tensor type corresponding to the given static or @@ -307,7 +307,7 @@ Status GetOutputInfo( // XLA computation always uses Tuple shape. *xla_output_shape = xla::ShapeUtil::MakeTupleShape(shapes); - return OkStatus(); + return absl::OkStatus(); } // Creates a vector that maps from the parameters of the XLA computation to @@ -666,7 +666,7 @@ Status ConvertMLIRWithOptionalXlaComputation( module_op, &hlo_proto, use_tuple_args, return_tuple, options)); *xla_computation = xla::XlaComputation(hlo_proto.hlo_module()); } - return OkStatus(); + return absl::OkStatus(); } // Wraps the optional lowering version to keep the api the same for clients. @@ -692,7 +692,7 @@ Status CompileMlirSetup(mlir::ModuleOp module_op, if (VLOG_IS_ON(2)) tensorflow::DumpMlirOpToFile("compile_mlir_shape_refiner", module_op); - return OkStatus(); + return absl::OkStatus(); } Status BuildHloFromTf(mlir::ModuleOp module_op, xla::XlaBuilder& builder, @@ -715,7 +715,7 @@ Status BuildHloFromTf(mlir::ModuleOp module_op, xla::XlaBuilder& builder, if (VLOG_IS_ON(2)) tensorflow::DumpMlirOpToFile("build_hlo_tf_after", module_op); - return OkStatus(); + return absl::OkStatus(); } Status PopulateCollectiveInfo(mlir::ModuleOp module_op, @@ -729,7 +729,7 @@ Status PopulateCollectiveInfo(mlir::ModuleOp module_op, kGroupSizeAttrName.data(), kGroupSizeAttrName.size())); if (group_key_attr == nullptr && group_size_attr == nullptr) { // No CollectiveInfo is present. 
- return OkStatus(); + return absl::OkStatus(); } DCHECK(group_key_attr != nullptr) << "module attribute " << kGroupKeyAttrName @@ -742,7 +742,7 @@ Status PopulateCollectiveInfo(mlir::ModuleOp module_op, VLOG(2) << "Populating CollectiveInfo: group_key=" << group_key << " group_size=" << group_size; compilation_result->collective_info = {group_key, group_size, 0}; - return OkStatus(); + return absl::OkStatus(); } Status PopulateResultIOInfo( @@ -945,7 +945,7 @@ Status CompileGraphSetup( if (VLOG_IS_ON(1)) tensorflow::DumpMlirOpToFile("compile_graph_setup_after", module_op); - return OkStatus(); + return absl::OkStatus(); } Status BuildHloFromModule(mlir::ModuleOp module_op, xla::XlaBuilder& builder, diff --git a/tensorflow/compiler/mlir/tf2xla/api/v1/compile_mlir_util.h b/tensorflow/compiler/mlir/tf2xla/api/v1/compile_mlir_util.h index aaccd39a3db398..6a7a4c42ee3d52 100644 --- a/tensorflow/compiler/mlir/tf2xla/api/v1/compile_mlir_util.h +++ b/tensorflow/compiler/mlir/tf2xla/api/v1/compile_mlir_util.h @@ -147,7 +147,7 @@ Status PopulateResultIOInfo( // If enable_op_fallback is set to false, graph is legalized only if the graph // analysis for the graph is successful. Otherwise, an error is returned. ABSL_DEPRECATED("Use v2/legalize_tf.h::LegalizeMlirToHlo instead.") -StatusOr CompileMlirToXlaHlo( +absl::StatusOr CompileMlirToXlaHlo( mlir::ModuleOp module_op, llvm::ArrayRef arg_shapes, llvm::StringRef device_type, bool use_tuple_args, bool enable_op_fallback, bool use_return_tuple, bool use_resource_updates_for_aliases, @@ -163,7 +163,7 @@ StatusOr CompileMlirToXlaHlo( // If lower_to_xla_hlo is true then compiles down into XLA HLO, generates all // accompanying metadata and stores them in CompilationResult. 
ABSL_DEPRECATED("Use v2/legalize_tf.h::LegalizeMlirToHlo instead.") -StatusOr CompileSerializedMlirToXlaHlo( +absl::StatusOr CompileSerializedMlirToXlaHlo( llvm::StringRef mlir_module_string, llvm::ArrayRef arg_shapes, llvm::StringRef device_type, bool use_tuple_args, bool enable_op_fallback, const XlaShapeLayoutHelpers::ShapeDeterminationFns shape_determination_fns, diff --git a/tensorflow/compiler/mlir/tf2xla/api/v1/compile_tf_graph.cc b/tensorflow/compiler/mlir/tf2xla/api/v1/compile_tf_graph.cc index 0355204506068c..a289e3a6d84148 100644 --- a/tensorflow/compiler/mlir/tf2xla/api/v1/compile_tf_graph.cc +++ b/tensorflow/compiler/mlir/tf2xla/api/v1/compile_tf_graph.cc @@ -130,11 +130,11 @@ Status PopulateInputOutputAliasing( output_to_input_alias[aliasing_output.getInt()] = arg_index; } - if (output_to_input_alias.empty()) return OkStatus(); + if (output_to_input_alias.empty()) return absl::OkStatus(); xla::HloModuleProto* module_proto = compilation_result->computation->mutable_proto(); - StatusOr program_shape_or_status = + absl::StatusOr program_shape_or_status = compilation_result->computation->GetProgramShape(); TF_RET_CHECK(program_shape_or_status.ok()); @@ -155,10 +155,10 @@ Status PopulateInputOutputAliasing( } } *module_proto->mutable_input_output_alias() = config.ToProto(); - return OkStatus(); + return absl::OkStatus(); } -bool failed(const tsl::Status& status) { return !status.ok(); } +bool failed(const absl::Status& status) { return !status.ok(); } // Transforms the given module to be suitable for export to TensorFlow GraphDef // and then exports all functions to the given library. 
@@ -203,7 +203,7 @@ Status PrepareAndExportToLibrary(mlir::ModuleOp module, flib_def); } -tsl::Status CompileTFFunctionWithoutMlir( +absl::Status CompileTFFunctionWithoutMlir( FunctionToHloArgs function_computation, const tpu::TPUCompileMetadataProto& metadata, bool use_tuple_args, const XlaShapeLayoutHelpers::ShapeDeterminationFns @@ -230,7 +230,7 @@ tsl::Status CompileTFFunctionWithoutMlir( return comp_status; } -tsl::Status CompileMLIRTFFunction( +absl::Status CompileMLIRTFFunction( tpu::MlirToHloArgs mlir_computation, const tpu::TPUCompileMetadataProto& metadata, bool use_tuple_args, const XlaShapeLayoutHelpers::ShapeDeterminationFns @@ -293,7 +293,7 @@ tsl::Status CompileMLIRTFFunction( } // namespace -tsl::Status CompileTensorflowGraphToHlo( +absl::Status CompileTensorflowGraphToHlo( const std::variant& computation, const tpu::TPUCompileMetadataProto& metadata, bool use_tuple_args, const XlaShapeLayoutHelpers::ShapeDeterminationFns @@ -331,7 +331,7 @@ tsl::Status CompileTensorflowGraphToHlo( phase2_bridge_compilation_time->GetCell(kBridgePhase2Config) ->Add(timer.ElapsedCyclesInMilliseconds()); - return tsl::OkStatus(); + return absl::OkStatus(); } }; // namespace v1 diff --git a/tensorflow/compiler/mlir/tf2xla/api/v1/compile_tf_graph.h b/tensorflow/compiler/mlir/tf2xla/api/v1/compile_tf_graph.h index 0a4d8709393ef9..c3f2a6d2d0d868 100644 --- a/tensorflow/compiler/mlir/tf2xla/api/v1/compile_tf_graph.h +++ b/tensorflow/compiler/mlir/tf2xla/api/v1/compile_tf_graph.h @@ -34,7 +34,7 @@ namespace v1 { // Compiles the given Tensorflow graph into xla::HLO. The result is in // compilation_result. If the input computation is in MLIR, it will be // converted to a Tensorflow graph. Otherwise, the graph compiler will be run. 
-tsl::Status CompileTensorflowGraphToHlo( +absl::Status CompileTensorflowGraphToHlo( const std::variant& computation, const tpu::TPUCompileMetadataProto& metadata, bool use_tuple_args, XlaShapeLayoutHelpers::ShapeDeterminationFns shape_determination_funcs, diff --git a/tensorflow/compiler/mlir/tf2xla/api/v1/compile_tf_graph_test.cc b/tensorflow/compiler/mlir/tf2xla/api/v1/compile_tf_graph_test.cc index fdff5122c3516e..06208be8fc5893 100644 --- a/tensorflow/compiler/mlir/tf2xla/api/v1/compile_tf_graph_test.cc +++ b/tensorflow/compiler/mlir/tf2xla/api/v1/compile_tf_graph_test.cc @@ -76,7 +76,7 @@ MlirToHloArgs CreateTestMlirToHloArgs(const char* module_str = kMlirModuleStr) { class CompileTFGraphTest : public ::testing::Test { public: - tsl::StatusOr CompileWithComputation( + absl::StatusOr CompileWithComputation( const std::variant computation) { XlaCompilationResult compilation_result; @@ -99,7 +99,7 @@ class CompileTFGraphTest : public ::testing::Test { XlaShapeLayoutHelpers::ShapeDeterminationFns shape_determination_fns; - tsl::Status compilation_status = + absl::Status compilation_status = tensorflow::tf2xla::v1::CompileTensorflowGraphToHlo( computation, metadata_proto, use_tuple_args, shape_determination_fns, arg_shapes, &arg_core_mapping, diff --git a/tensorflow/compiler/mlir/tf2xla/api/v1/tf_dialect_to_executor_test.cc b/tensorflow/compiler/mlir/tf2xla/api/v1/tf_dialect_to_executor_test.cc index 38393d3753146e..cad1edf2b89018 100644 --- a/tensorflow/compiler/mlir/tf2xla/api/v1/tf_dialect_to_executor_test.cc +++ b/tensorflow/compiler/mlir/tf2xla/api/v1/tf_dialect_to_executor_test.cc @@ -61,16 +61,16 @@ class TensorflowDialectToExecutorTest : public ::testing::Test { context_.loadAllAvailableDialects(); } - tsl::Status CreateMlirModule(std::string mlir_module_filename) { + absl::Status CreateMlirModule(std::string mlir_module_filename) { std::string mlir_module_path = TestDataPath() + mlir_module_filename; mlir_module_ = mlir::parseSourceFile(mlir_module_path, 
&context_); if (!mlir_module_) { - return tsl::Status( + return absl::Status( absl::StatusCode::kNotFound, absl::StrCat("Could not find MLIR module at ", mlir_module_path)); } - return tsl::OkStatus(); + return absl::OkStatus(); } DialectRegistry registry_; diff --git a/tensorflow/compiler/mlir/tf2xla/api/v2/cluster_tf_test.cc b/tensorflow/compiler/mlir/tf2xla/api/v2/cluster_tf_test.cc index a5f64a91cd8cb4..14a9c1b1a99bff 100644 --- a/tensorflow/compiler/mlir/tf2xla/api/v2/cluster_tf_test.cc +++ b/tensorflow/compiler/mlir/tf2xla/api/v2/cluster_tf_test.cc @@ -66,16 +66,16 @@ class FunctionClusterTensorflowDialectTest : public ::testing::Test { context_.loadAllAvailableDialects(); } - tsl::Status CreateMlirModule(std::string mlir_module_filename) { + absl::Status CreateMlirModule(std::string mlir_module_filename) { std::string mlir_module_path = TestDataPath() + mlir_module_filename; mlir_module_ = mlir::parseSourceFile(mlir_module_path, &context_); if (!mlir_module_) { - return tsl::Status( + return absl::Status( absl::StatusCode::kNotFound, absl::StrCat("Could not find MLIR module at ", mlir_module_path)); } - return tsl::OkStatus(); + return absl::OkStatus(); } DialectRegistry registry_; diff --git a/tensorflow/compiler/mlir/tf2xla/api/v2/legalize_tf.cc b/tensorflow/compiler/mlir/tf2xla/api/v2/legalize_tf.cc index d297e45b70e0bb..d84e4d8692a19d 100644 --- a/tensorflow/compiler/mlir/tf2xla/api/v2/legalize_tf.cc +++ b/tensorflow/compiler/mlir/tf2xla/api/v2/legalize_tf.cc @@ -51,8 +51,6 @@ namespace tensorflow { namespace tf2xla { namespace v2 { -using metrics::IncrementTfMlirBridgeSecondPhaseCounter; -using metrics::MlirBridgeSecondPhaseMetric; using tpu::FunctionToHloArgs; using tpu::MlirToHloArgs; using tpu::ShardingAndIndex; @@ -110,7 +108,7 @@ Status DumpHloCompilationResult(std::string_view name, XlaCompilationResult* compilation_result) { if (!VLOG_IS_ON(2) && !DEBUG_DATA_DUMPER()->ShouldDump(std::string(name), kDebugGroupMain)) { - return OkStatus(); + return 
absl::OkStatus(); } TF_ASSIGN_OR_RETURN( @@ -130,12 +128,12 @@ Status DumpHloCompilationResult(std::string_view name, tensorflow::DumpRawStringToFile(name, all_computations); - return OkStatus(); + return absl::OkStatus(); } } // namespace -tsl::StatusOr LegalizeMlirToHlo( +absl::StatusOr LegalizeMlirToHlo( const std::variant& computation, const tpu::TPUCompileMetadataProto& metadata, bool use_tuple_args, llvm::StringRef device_type, @@ -185,7 +183,7 @@ tsl::StatusOr LegalizeMlirToHlo( } VLOG(1) << "Failed to compile MLIR computation to XLA HLO using Combined " - "MLIR and XlaBuilder Bridge. Falling back to MLIR tf2xla Bridge. " + "MLIR and XlaBuilder Bridge. Failed to lower to hlo." << combined_bridge_status.status(); tsl::error_logging::Log(kBridgeComponent, "TFXLA_API_V2_COMBINED_BRIDGE", combined_bridge_status.status().ToString()) @@ -201,7 +199,7 @@ tsl::StatusOr LegalizeMlirToHlo( VLOG(1) << "Successfully compiled MLIR computation to XLA HLO using MLIR " "tf2xla Bridge"; IncrementTfMlirBridgeSecondPhaseCounter( - MlirBridgeSecondPhaseMetric::kMlirWithFallbackModeSuccess); + metrics::MlirBridgeSecondPhaseMetric::kMlirWithFallbackModeSuccess); DumpHloCompilationResult("legalize_tf_mlir_bridge.hlo", compilation_result.get()) @@ -219,7 +217,7 @@ tsl::StatusOr LegalizeMlirToHlo( mlir_bridge_status.status().ToString()) .IgnoreError(); IncrementTfMlirBridgeSecondPhaseCounter( - MlirBridgeSecondPhaseMetric::kMlirWithFallbackModeFailure); + metrics::MlirBridgeSecondPhaseMetric::kMlirWithFallbackModeFailure); } return mlir_bridge_status.status(); diff --git a/tensorflow/compiler/mlir/tf2xla/api/v2/legalize_tf.h b/tensorflow/compiler/mlir/tf2xla/api/v2/legalize_tf.h index c3dc6e18b92a1e..14a8271de171d1 100644 --- a/tensorflow/compiler/mlir/tf2xla/api/v2/legalize_tf.h +++ b/tensorflow/compiler/mlir/tf2xla/api/v2/legalize_tf.h @@ -50,7 +50,7 @@ namespace v2 { // arg_core_mapping - Which args go on which cores. 
// per_core_arg_shapes - For each core, the shapes for each argument. // client - The Xla Compilation client. -tsl::StatusOr LegalizeMlirToHlo( +absl::StatusOr LegalizeMlirToHlo( const std::variant& computation, const tpu::TPUCompileMetadataProto& metadata, bool use_tuple_args, llvm::StringRef device_type, diff --git a/tensorflow/compiler/mlir/tf2xla/api/v2/legalize_tf_test.cc b/tensorflow/compiler/mlir/tf2xla/api/v2/legalize_tf_test.cc index 81b3b5a180eb93..0e7e61999d8f2b 100644 --- a/tensorflow/compiler/mlir/tf2xla/api/v2/legalize_tf_test.cc +++ b/tensorflow/compiler/mlir/tf2xla/api/v2/legalize_tf_test.cc @@ -101,7 +101,7 @@ static constexpr char kUnsupportedMlirBridgeModuleStr[] = R"( } })"; -tsl::StatusOr CompileMlirModule( +absl::StatusOr CompileMlirModule( const char* mlir_module_str, ConfigProto::Experimental::MlirBridgeRollout rollout_state) { MlirToHloArgs mlir_to_hlo_args; @@ -291,7 +291,7 @@ TEST(LegalizeTFTest, RecordsStreamzForNoMlirFallback) { std::vector> custom_legalization_passes; // This doesn't actually compile correctly. 
- tsl::StatusOr compile_result = + absl::StatusOr compile_result = LegalizeMlirToHlo(function_to_hlo_args, metadata_proto, use_tuple_args, /*device_type=*/"XLA_CPU_JIT", custom_legalization_passes, diff --git a/tensorflow/compiler/mlir/tf2xla/api/v2/tf_dialect_to_executor_test.cc b/tensorflow/compiler/mlir/tf2xla/api/v2/tf_dialect_to_executor_test.cc index 9940a8d52c18e8..0c64dd3dcbe1a3 100644 --- a/tensorflow/compiler/mlir/tf2xla/api/v2/tf_dialect_to_executor_test.cc +++ b/tensorflow/compiler/mlir/tf2xla/api/v2/tf_dialect_to_executor_test.cc @@ -61,16 +61,16 @@ class TensorflowDialectToExecutorTest : public ::testing::Test { context_.loadAllAvailableDialects(); } - tsl::Status CreateMlirModule(std::string mlir_module_filename) { + absl::Status CreateMlirModule(std::string mlir_module_filename) { std::string mlir_module_path = TestDataPath() + mlir_module_filename; mlir_module_ = mlir::parseSourceFile(mlir_module_path, &context_); if (!mlir_module_) { - return tsl::Status( + return absl::Status( absl::StatusCode::kNotFound, absl::StrCat("Could not find MLIR module at ", mlir_module_path)); } - return tsl::OkStatus(); + return absl::OkStatus(); } DialectRegistry registry_; diff --git a/tensorflow/compiler/mlir/tf2xla/internal/clustering_bridge_passes.cc b/tensorflow/compiler/mlir/tf2xla/internal/clustering_bridge_passes.cc index e289934b69fbe0..d0909c452a0325 100644 --- a/tensorflow/compiler/mlir/tf2xla/internal/clustering_bridge_passes.cc +++ b/tensorflow/compiler/mlir/tf2xla/internal/clustering_bridge_passes.cc @@ -189,8 +189,6 @@ void AddNonReplicatedBridgeClusteringPipelinePasses(OpPassManager& pm) { // inference. pm.addPass(mlir::TF::CreateGuaranteeAllFuncsOneUsePass()); pm.addPass(mlir::TF::CreateTFShapeInferencePass()); - pm.addPass( - tensorflow::tf2xla::internal::CreateXlaOutlineEntryFunctionsPass()); // Encapsulate PartitionedCall ops within a cluster so that the composite // resource ops can be decomposed. 
pm.addPass(tensorflow::tf2xla::internal::CreateXlaClusterFormationPass()); @@ -200,12 +198,6 @@ void AddNonReplicatedBridgeClusteringPipelinePasses(OpPassManager& pm) { pm.addNestedPass(mlir::createCanonicalizerPass()); // Decompose resource ops. pm.addPass(mlir::TFDevice::CreateDecomposeResourceOpsInClusterPass()); - // TODO(b/267193636): Remove this flag when outside compilation - // for generic pipeline is landed. - if (tensorflow::GetMlirCommonFlags() - ->tf_mlir_enable_generic_outside_compilation) { - pm.addPass(mlir::TF::CreateTFFunctionalControlFlowToRegions()); - } // Run another shape inference pass because resource decomposition might have // created new partial types. Also, after dropping `shape_invariant` attribute // from While/WhileRegion ops within cluster would lead to more precise @@ -220,17 +212,6 @@ void AddNonReplicatedBridgeClusteringPipelinePasses(OpPassManager& pm) { // Lift resource operations out of device computation. This step needs to be // done after inlining. pm.addPass(mlir::TFDevice::CreateResourceOpLiftingPass()); - // TODO(b/267193636): Remove this flag when outside compilation - // for generic pipeline is landed. - if (tensorflow::GetMlirCommonFlags() - ->tf_mlir_enable_generic_outside_compilation) { - pm.addPass( - tensorflow::tf2xla::internal::CreateMarkOpsForOutsideCompilationPass()); - pm.addPass(tensorflow::tf2xla::internal:: - CreateExtractHeadTailOutsideCompilationPass()); - pm.addPass( - tensorflow::tf2xla::internal::CreateExtractOutsideCompilationPass()); - } // Outline clusters into cluster functions. 
pm.addPass(mlir::TFDevice::CreateClusterOutliningPass()); // Verifies clustering has conformed with the expected invariants diff --git a/tensorflow/compiler/mlir/tf2xla/internal/clustering_bridge_passes_test.cc b/tensorflow/compiler/mlir/tf2xla/internal/clustering_bridge_passes_test.cc index d7d26cc1f7de50..9507ab371c1bcf 100644 --- a/tensorflow/compiler/mlir/tf2xla/internal/clustering_bridge_passes_test.cc +++ b/tensorflow/compiler/mlir/tf2xla/internal/clustering_bridge_passes_test.cc @@ -35,7 +35,7 @@ TEST(ClusteringBridgePassesTest, AddsNonTPUBridgePasses) { OpPassManager pass_manager; AddNonReplicatedBridgeClusteringPipelinePasses(pass_manager); - EXPECT_EQ(pass_manager.size(), 16); + EXPECT_EQ(pass_manager.size(), 15); } }; // namespace internal diff --git a/tensorflow/compiler/mlir/tf2xla/internal/legalize_tf_mlir.cc b/tensorflow/compiler/mlir/tf2xla/internal/legalize_tf_mlir.cc index 4fd0c21d68331f..581f28fb4c9557 100644 --- a/tensorflow/compiler/mlir/tf2xla/internal/legalize_tf_mlir.cc +++ b/tensorflow/compiler/mlir/tf2xla/internal/legalize_tf_mlir.cc @@ -59,7 +59,7 @@ constexpr char kBridgeComponent[] = "TFXLABridge"; using tpu::MlirToHloArgs; using tpu::ShardingAndIndex; -tsl::StatusOr CompileFromMlirToXlaHlo( +absl::StatusOr CompileFromMlirToXlaHlo( bool lower_to_xla_hlo, const MlirToHloArgs& computation, const tpu::TPUCompileMetadataProto& metadata, llvm::StringRef device_type, const XlaShapeLayoutHelpers::ShapeDeterminationFns& shape_determination_fns, @@ -103,7 +103,7 @@ tsl::StatusOr CompileFromMlirToXlaHlo( return compiled_mlir; } -tsl::StatusOr LegalizeWithMlirBridge( +absl::StatusOr LegalizeWithMlirBridge( const tpu::MlirToHloArgs& computation, const tpu::TPUCompileMetadataProto& metadata, bool use_tuple_args, llvm::StringRef device_type, @@ -119,7 +119,7 @@ tsl::StatusOr LegalizeWithMlirBridge( // Enabling op fallback also enables whole graph fallback if op by op // fallback failed. 
- tsl::StatusOr mlir_bridge_status = CompileFromMlirToXlaHlo( + absl::StatusOr mlir_bridge_status = CompileFromMlirToXlaHlo( /*lower_to_xla_hlo=*/true, computation, metadata, device_type, shape_determination_fns, use_tuple_args, compilation_result, custom_legalization_passes, arg_shapes, arg_core_mapping, diff --git a/tensorflow/compiler/mlir/tf2xla/internal/legalize_tf_mlir.h b/tensorflow/compiler/mlir/tf2xla/internal/legalize_tf_mlir.h index 91a613f8c6f848..014d9a4f35d31c 100644 --- a/tensorflow/compiler/mlir/tf2xla/internal/legalize_tf_mlir.h +++ b/tensorflow/compiler/mlir/tf2xla/internal/legalize_tf_mlir.h @@ -33,7 +33,7 @@ namespace internal { // result of running all the MLIR Bridge passes. If compile_to_xla_hlo is true // then those passes include all the Legalization to XLA HLO which is returned // in the compilation_result. -tsl::StatusOr CompileFromMlirToXlaHlo( +absl::StatusOr CompileFromMlirToXlaHlo( bool lower_to_xla_hlo, const tpu::MlirToHloArgs& computation, const tpu::TPUCompileMetadataProto& metadata, llvm::StringRef device_type, const XlaShapeLayoutHelpers::ShapeDeterminationFns& shape_determination_fns, @@ -45,7 +45,7 @@ tsl::StatusOr CompileFromMlirToXlaHlo( // Compiles a serialized MLIR module into XLA HLO, generates all accompanying // metadata and stores them in CompilationResult. 
-tsl::StatusOr LegalizeWithMlirBridge( +absl::StatusOr LegalizeWithMlirBridge( const tpu::MlirToHloArgs& computation, const tpu::TPUCompileMetadataProto& metadata, bool use_tuple_args, llvm::StringRef device_type, diff --git a/tensorflow/compiler/mlir/tf2xla/internal/legalize_tf_mlir_test.cc b/tensorflow/compiler/mlir/tf2xla/internal/legalize_tf_mlir_test.cc index 904bfa85a3cc98..c8b9577e0daa6a 100644 --- a/tensorflow/compiler/mlir/tf2xla/internal/legalize_tf_mlir_test.cc +++ b/tensorflow/compiler/mlir/tf2xla/internal/legalize_tf_mlir_test.cc @@ -49,8 +49,8 @@ static constexpr char kMlirModuleStr[] = R"( } })"; -tsl::StatusOr CompileMlirModule(bool compile_to_xla_hlo, - const char* module_str) { +absl::StatusOr CompileMlirModule(bool compile_to_xla_hlo, + const char* module_str) { MlirToHloArgs mlir_to_hlo_args; mlir_to_hlo_args.mlir_module = module_str; @@ -71,7 +71,7 @@ tsl::StatusOr CompileMlirModule(bool compile_to_xla_hlo, &per_core_arg_shapes); } -tsl::StatusOr LegalizeMlirModule( +absl::StatusOr LegalizeMlirModule( const char* module_str) { MlirToHloArgs mlir_to_hlo_args; mlir_to_hlo_args.mlir_module = module_str; diff --git a/tensorflow/compiler/mlir/tf2xla/internal/legalize_tf_to_hlo.cc b/tensorflow/compiler/mlir/tf2xla/internal/legalize_tf_to_hlo.cc index ba1a20a27ef751..e26741e0877d7b 100644 --- a/tensorflow/compiler/mlir/tf2xla/internal/legalize_tf_to_hlo.cc +++ b/tensorflow/compiler/mlir/tf2xla/internal/legalize_tf_to_hlo.cc @@ -44,7 +44,7 @@ using metrics::IncrementTfMlirBridgeSecondPhaseCounter; using metrics::MlirBridgeSecondPhaseMetric; using tpu::MlirToHloArgs; -tsl::StatusOr LegalizeTfToHlo( +absl::StatusOr LegalizeTfToHlo( const tpu::MlirToHloArgs& computation, const tpu::TPUCompileMetadataProto& metadata, bool use_tuple_args, llvm::StringRef device_type, @@ -57,8 +57,8 @@ tsl::StatusOr LegalizeTfToHlo( LOG_FIRST_N(INFO, 1) << "Compiling MLIR computation to XLA HLO using the " "Combined MLIR Tf2Xla Bridge."; - tsl::StatusOr mlir_compilation - = 
internal::CompileFromMlirToXlaHlo( + absl::StatusOr mlir_compilation = + internal::CompileFromMlirToXlaHlo( /*lower_to_xla_hlo=*/false, computation, metadata, device_type, shape_determination_fns, use_tuple_args, compilation_result, custom_legalization_passes, arg_shapes, arg_core_mapping, diff --git a/tensorflow/compiler/mlir/tf2xla/internal/legalize_tf_to_hlo.h b/tensorflow/compiler/mlir/tf2xla/internal/legalize_tf_to_hlo.h index c0a8283ed30605..664bd549ed360d 100644 --- a/tensorflow/compiler/mlir/tf2xla/internal/legalize_tf_to_hlo.h +++ b/tensorflow/compiler/mlir/tf2xla/internal/legalize_tf_to_hlo.h @@ -29,7 +29,7 @@ namespace internal { // Legalize the given MLIR module to XLA HLO using a combination of the MLIR // Bridge and XlaBuilder -tsl::StatusOr LegalizeTfToHlo( +absl::StatusOr LegalizeTfToHlo( const tpu::MlirToHloArgs& computation, const tpu::TPUCompileMetadataProto& metadata, bool use_tuple_args, llvm::StringRef device_type, diff --git a/tensorflow/compiler/mlir/tf2xla/internal/legalize_tf_to_hlo_test.cc b/tensorflow/compiler/mlir/tf2xla/internal/legalize_tf_to_hlo_test.cc index ef5a82ed844728..686081c049e1b9 100644 --- a/tensorflow/compiler/mlir/tf2xla/internal/legalize_tf_to_hlo_test.cc +++ b/tensorflow/compiler/mlir/tf2xla/internal/legalize_tf_to_hlo_test.cc @@ -72,7 +72,7 @@ static constexpr char kBadMlirModuleStr[] = R"( } })"; -tsl::StatusOr CompileMlirModule( +absl::StatusOr CompileMlirModule( const char* module_str) { MlirToHloArgs mlir_to_hlo_args; mlir_to_hlo_args.rollout_state = diff --git a/tensorflow/compiler/mlir/tf2xla/internal/logging_hooks_test.cc b/tensorflow/compiler/mlir/tf2xla/internal/logging_hooks_test.cc index a52e316d0bc334..840d4c971e7bb5 100644 --- a/tensorflow/compiler/mlir/tf2xla/internal/logging_hooks_test.cc +++ b/tensorflow/compiler/mlir/tf2xla/internal/logging_hooks_test.cc @@ -67,16 +67,16 @@ class LoggingHooksTest : public ::testing::Test { setenv("TF_DUMP_GRAPH_PREFIX", test_dir_.c_str(), 1); } - tsl::Status 
CreateMlirModule(std::string mlir_module_filename) { + absl::Status CreateMlirModule(std::string mlir_module_filename) { std::string mlir_module_path = TestDataPath() + mlir_module_filename; mlir_module_ = mlir::parseSourceFile(mlir_module_path, &context_); if (!mlir_module_) { - return tsl::Status( + return absl::Status( absl::StatusCode::kNotFound, absl::StrCat("Could not find MLIR module at ", mlir_module_path)); } - return tsl::OkStatus(); + return absl::OkStatus(); } DialectRegistry registry_; diff --git a/tensorflow/compiler/mlir/tf2xla/internal/mlir_bridge_pass_util.cc b/tensorflow/compiler/mlir/tf2xla/internal/mlir_bridge_pass_util.cc index 2f8d85d7325298..7bf4c74e094af5 100644 --- a/tensorflow/compiler/mlir/tf2xla/internal/mlir_bridge_pass_util.cc +++ b/tensorflow/compiler/mlir/tf2xla/internal/mlir_bridge_pass_util.cc @@ -76,6 +76,36 @@ LogicalResult HasAttr( return failure(); } +// Check if the `graph` has parameter server jobs and resource variable +// arguments that are on parameter servers +bool HasPsWithResourceVariable(const Graph& graph) { + // Check parameter serverjobs and resource variable arguments that are + // on parameter servers. 
+ const std::string jobType = "ps"; + const std::string nodeType = "_Arg"; + const std::string attrKey = "T"; + for (const Node* node : graph.nodes()) { + if (node->type_string() == nodeType) { + auto device_name = node->assigned_device_name(); + DeviceNameUtils::ParsedName device; + if (DeviceNameUtils::ParseFullName(device_name, &device) && + device.has_job && device.job == jobType) { + for (const auto& attr : node->attrs()) { + auto attr_key = attr.first; + auto attr_value = attr.second; + if (attr_key == attrKey && + attr_value.value_case() == AttrValue::kType && + attr_value.type() == DT_RESOURCE) { + return true; + break; + } + } + } + } + } + return false; +} + bool IsNonReplicatedGraph(const Graph& graph, const FunctionLibraryDefinition* function_library) { auto predicate = [](const Graph& graph) { @@ -111,22 +141,6 @@ bool IsReplicatedGraph(const Graph& graph, return HasAttr(graph, function_library, predicate).succeeded(); } -bool IsSingleCoreTpuGraph(const Graph& graph, - const FunctionLibraryDefinition* function_library) { - auto predicate = [](const Graph& graph) { - for (const Node* node : graph.nodes()) { - // _xla_compile_device_type=TPU is found in single-core TPU graphs. - auto attr = - node->attrs().FindByString(std::string(kCompileDeviceTypeAttr)); - if (attr && attr->s() == kTpuDevice) { - return true; - } - } - return false; - }; - return HasAttr(graph, function_library, predicate).succeeded(); -} - bool IsReplicatedGraph(mlir::ModuleOp module) { auto walk_result = module.walk([&](mlir::Operation* op) { // TODO(b/223677572): Once the scope for new compilation and replication @@ -144,25 +158,6 @@ bool IsReplicatedGraph(mlir::ModuleOp module) { return walk_result.wasInterrupted(); } -bool IsSingleCoreTPUGraph(mlir::ModuleOp module) { - auto walk_result = module.walk([&](mlir::Operation* op) { - // Check for ops with compile device type "TPU". This allows us to support - // TPU compilation without replication. 
Note that currently the compile - // device type is not set by default before bridge, only if eager context - // attribute `jit_compile_rewrite` is true. - // TODO(b/229028654): Remove string conversion once we have C++17. - const llvm::StringRef compile_device_type_attr_name( - kCompileDeviceTypeAttr.data(), kCompileDeviceTypeAttr.size()); - auto compilation_attr = - op->getAttrOfType(compile_device_type_attr_name); - if (compilation_attr && compilation_attr.getValue().str() == kTpuDevice) { - return mlir::WalkResult::interrupt(); - } - return mlir::WalkResult::advance(); - }); - return walk_result.wasInterrupted(); -} - // Traverses each node in the graph and check if any of them is // TPUPartitionedCall. If so, return true. Otherwise, return false. bool DoesGraphContainTPUPartitionedCall(const Graph& graph) { @@ -206,17 +201,17 @@ bool AreFunctionsFromFlibDefInference( bool IsSupportedByNonReplicatedBridge( const Graph& graph, const FunctionLibraryDefinition* function_library) { - return IsNonReplicatedGraph(graph, function_library); + return IsNonReplicatedGraph(graph, function_library) && + HasPsWithResourceVariable(graph); } bool IsSupportedByReplicatedBridge( const Graph& graph, const FunctionLibraryDefinition* function_library) { - return IsReplicatedGraph(graph, function_library) || - IsSingleCoreTpuGraph(graph, function_library); + return IsReplicatedGraph(graph, function_library); } bool IsSupportedByReplicatedBridge(mlir::ModuleOp module) { - return IsReplicatedGraph(module) || IsSingleCoreTPUGraph(module); + return IsReplicatedGraph(module); } bool HasTPUPartitionedCallOpInModule(mlir::ModuleOp module) { diff --git a/tensorflow/compiler/mlir/tf2xla/internal/mlir_bridge_pass_util_test.cc b/tensorflow/compiler/mlir/tf2xla/internal/mlir_bridge_pass_util_test.cc index 699964e989f1e9..6cbc67d4ec395c 100644 --- a/tensorflow/compiler/mlir/tf2xla/internal/mlir_bridge_pass_util_test.cc +++ b/tensorflow/compiler/mlir/tf2xla/internal/mlir_bridge_pass_util_test.cc 
@@ -44,28 +44,19 @@ namespace tensorflow { namespace { -FunctionDef OuterXTimesTwo() { +// Produce a valid graph with a resource-type input. +FunctionDef PassThroughResource() { return FunctionDefHelper::Define( - // Name - "OuterXTimesTwo", - // Args - {"x: float"}, - // Return values - {"y: float"}, - // Attr def - {}, - {{{"y"}, - "StatefulPartitionedCall", - {"x"}, - {{"Tin", DataTypeSlice{DT_FLOAT}}, - {"Tout", DataTypeSlice{DT_FLOAT}}, - {"f", - FunctionDefHelper::FunctionRef("XTimesTwoFloat", {{"T", DT_FLOAT}})}, - {std::string(kMustCompileAttr), true}}}}); + /*function_name=*/"PassThroughResource", + /*arg_def=*/{"in: resource"}, + /*ret_def=*/{"out: resource"}, + /*attr_def=*/{}, + /*node_def=*/ + {{{"out"}, "Identity", {"in"}, {{"T", DataType::DT_RESOURCE}}}}); } TEST(IsSupportedByNonReplicatedBridge, NonReplicatedGraph) { - const FunctionDef& fd = test::function::XTimesTwo(); + const FunctionDef& fd = PassThroughResource(); FunctionDefLibrary flib; *flib.add_function() = fd; FunctionLibraryDefinition flib_def(OpRegistry::Global(), flib); @@ -76,7 +67,7 @@ TEST(IsSupportedByNonReplicatedBridge, NonReplicatedGraph) { ConfigProto config = ConfigProto(); Scope root = Scope::NewRootScope().ExitOnError(); - Output a = ops::_Arg(root.WithOpName("A"), DT_FLOAT, 0); + Output a = ops::_Arg(root.WithOpName("A"), DT_RESOURCE, 0); std::vector inputs({NodeBuilder::NodeOut(a.node())}); Node* call; @@ -85,50 +76,21 @@ TEST(IsSupportedByNonReplicatedBridge, NonReplicatedGraph) { TF_ASSERT_OK( NodeBuilder("B", "StatefulPartitionedCall", &root.graph()->flib_def()) .Input(inputs) - .Attr("Tin", {DT_FLOAT}) - .Attr("Tout", {DT_FLOAT}) + .Attr("Tin", {DT_RESOURCE}) + .Attr("Tout", {DT_RESOURCE}) .Attr("f", f_name_attr) .Finalize(root.graph(), &call)); call->AddAttr(std::string(kMustCompileAttr), true); TF_ASSERT_OK(root.ToGraph(&graph)); - EXPECT_TRUE( - IsSupportedByNonReplicatedBridge(graph, /*function_library=*/nullptr)); -} - -// Checks that HasAttr actually goes through 
function library. -TEST(IsSupportedByNonReplicatedBridge, NonReplicatedFunctionLibrary) { - const FunctionDef& fd = OuterXTimesTwo(); - FunctionDefLibrary flib; - *flib.add_function() = fd; - FunctionLibraryDefinition flib_def(OpRegistry::Global(), flib); - Graph graph(OpRegistry::Global()); - graph.SetConstructionContext(ConstructionContext::kEagerRuntime); - tensorflow::set_tf2_execution(true); - - ConfigProto config = ConfigProto(); - Scope root = Scope::NewRootScope().ExitOnError(); - - Output a = ops::_Arg(root.WithOpName("A"), DT_FLOAT, 0); - std::vector inputs({NodeBuilder::NodeOut(a.node())}); - - // Builds a call without compilation markers that calls a function with Xla - // clusters. - Node* call; - NameAttrList f_name_attr; - f_name_attr.set_name(fd.signature().name()); - TF_ASSERT_OK( - NodeBuilder("B", "StatefulPartitionedCall", &root.graph()->flib_def()) - .Input(inputs) - .Attr("Tin", {DT_FLOAT}) - .Attr("Tout", {DT_FLOAT}) - .Attr("f", f_name_attr) - .Finalize(root.graph(), &call)); + // Required for passing the PS server parameter check. 
+ for (Node* node : graph.nodes()) { + node->set_assigned_device_name("/job:ps/replica:0/task:0/device:GPU:0"); + } - TF_ASSERT_OK(root.ToGraph(&graph)); EXPECT_TRUE( - IsSupportedByNonReplicatedBridge(graph, /*function_library=*/&flib_def)); + IsSupportedByNonReplicatedBridge(graph, /*function_library=*/nullptr)); } TEST(IsSupportedByReplicatedBridge, ReplicatedGraph) { @@ -164,39 +126,6 @@ TEST(IsSupportedByReplicatedBridge, ReplicatedGraph) { IsSupportedByReplicatedBridge(graph, /*function_library=*/nullptr)); } -TEST(IsSupportedByReplicatedBridge, SingleCoreTpuGraph) { - const FunctionDef& fd = test::function::XTimesTwo(); - FunctionDefLibrary flib; - *flib.add_function() = fd; - FunctionLibraryDefinition flib_def(OpRegistry::Global(), flib); - Graph graph(flib_def); - graph.SetConstructionContext(ConstructionContext::kEagerRuntime); - tensorflow::set_tf2_execution(true); - - ConfigProto config = ConfigProto(); - Scope root = Scope::NewRootScope().ExitOnError(); - - Output a = ops::_Arg(root.WithOpName("A"), DT_FLOAT, 0); - std::vector inputs({NodeBuilder::NodeOut(a.node())}); - - Node* call; - NameAttrList f_name_attr; - f_name_attr.set_name(fd.signature().name()); - TF_ASSERT_OK( - NodeBuilder("B", "StatefulPartitionedCall", &root.graph()->flib_def()) - .Input(inputs) - .Attr("Tin", {DT_FLOAT}) - .Attr("Tout", {DT_FLOAT}) - .Attr("f", f_name_attr) - .Finalize(root.graph(), &call)); - call->AddAttr(std::string(kCompileDeviceTypeAttr), kTpuDevice); - - TF_ASSERT_OK(root.ToGraph(&graph)); - - EXPECT_TRUE( - IsSupportedByReplicatedBridge(graph, /*function_library=*/nullptr)); -} - TEST(IsSupportedByReplicatedBridge, ReplicatedModule) { const char* const code = R"mlir( func.func @entry_func_1(%arg0: tensor) -> tensor attributes {tf.entry_function = {}} { @@ -212,21 +141,6 @@ func.func @entry_func_1(%arg0: tensor) -> tensor attributes {tf.entry_ EXPECT_TRUE(IsSupportedByReplicatedBridge(*module)); } -TEST(IsSupportedByReplicatedBridge, SingleCoreTpuModule) { - 
const char* const code = R"mlir( -func.func @entry_func_1(%arg0: tensor) -> tensor attributes {tf.entry_function = {}} { - %0 = "tf.Identity"(%arg0) {_xla_compile_device_type = "TPU"} : (tensor) -> (tensor) - func.return %0 : tensor -} -)mlir"; - mlir::MLIRContext context; - context.loadDialect(); - mlir::OwningOpRef module = - mlir::parseSourceString(code, &context); - ASSERT_TRUE(module); - EXPECT_TRUE(IsSupportedByReplicatedBridge(*module)); -} - TEST(HasTPUPartitionedCallOpInModule, HasTPUPartitionedCallModule) { const char* const code = R"mlir( module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, producer = 268 : i32}} { diff --git a/tensorflow/compiler/mlir/tf2xla/internal/passes/BUILD b/tensorflow/compiler/mlir/tf2xla/internal/passes/BUILD index 4c6f68a3419656..9641e092815b58 100644 --- a/tensorflow/compiler/mlir/tf2xla/internal/passes/BUILD +++ b/tensorflow/compiler/mlir/tf2xla/internal/passes/BUILD @@ -33,7 +33,6 @@ cc_library( ":verify_clustering_pass", ":xla_broadcast", ":xla_cluster_formation", - ":xla_outline_entry_functions", "@llvm-project//mlir:FuncDialect", "@llvm-project//mlir:IR", "@llvm-project//mlir:Pass", @@ -135,7 +134,6 @@ cc_library( "//tensorflow/compiler/mlir/tensorflow:tpu_rewrite_device_util", "//tensorflow/compiler/mlir/tensorflow/transforms:tf_pass_inc_gen", "//tensorflow/core:framework", - "//tensorflow/core:lib", "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/log", "@com_google_absl//absl/strings", @@ -289,36 +287,6 @@ cc_library( ], ) -cc_library( - name = "xla_outline_entry_functions", - srcs = ["xla_outline_entry_functions.cc"], - textual_hdrs = [ - "clustering_passes.h.inc", - ], - deps = [ - ":clustering_passes_inc_gen", - "//tensorflow/compiler/mlir/tensorflow", - "//tensorflow/compiler/mlir/tensorflow:attribute_utils", - "//tensorflow/compiler/mlir/tensorflow:call_graph_util", - "//tensorflow/compiler/mlir/tensorflow:cluster_util", - 
"//tensorflow/compiler/mlir/tensorflow:string_util", - "//tensorflow/compiler/mlir/tensorflow:tensorflow_analysis", - "//tensorflow/compiler/mlir/tensorflow:tensorflow_types", - "//tensorflow/compiler/mlir/tensorflow:tpu_rewrite_device_util", - "//tensorflow/core:core_cpu_base", - "//tensorflow/core:framework", - "//tensorflow/core:portable_gif_internal", - "@com_google_absl//absl/container:flat_hash_map", - "@com_google_absl//absl/strings", - "@llvm-project//llvm:Support", - "@llvm-project//mlir:FuncDialect", - "@llvm-project//mlir:IR", - "@llvm-project//mlir:Pass", - "@llvm-project//mlir:Support", - "@llvm-project//mlir:TransformUtils", - ], -) - cc_library( name = "mark_ops_for_outside_compilation", srcs = ["mark_ops_for_outside_compilation.cc"], @@ -425,22 +393,6 @@ cc_library( ], ) -tf_cc_test( - name = "tpu_cluster_formation_test", - srcs = ["tpu_cluster_formation_test.cc"], - deps = [ - ":clustering_passes", - "//tensorflow/compiler/mlir/tf2xla/transforms:test_utils", - "//tensorflow/core/lib/monitoring:cell_reader", - "@com_google_googletest//:gtest_main", - "@llvm-project//mlir:FuncDialect", - "@llvm-project//mlir:IR", - "@llvm-project//mlir:Pass", - "@llvm-project//mlir:Support", - "@local_tsl//tsl/platform:statusor", - ], -) - cc_library( name = "lowering_passes", hdrs = [ diff --git a/tensorflow/compiler/mlir/tf2xla/internal/passes/clustering_passes.h b/tensorflow/compiler/mlir/tf2xla/internal/passes/clustering_passes.h index 5eba662c4bae60..fb6e32ac377b79 100644 --- a/tensorflow/compiler/mlir/tf2xla/internal/passes/clustering_passes.h +++ b/tensorflow/compiler/mlir/tf2xla/internal/passes/clustering_passes.h @@ -46,11 +46,6 @@ CreateExtractOutsideCompilationPass(); std::unique_ptr> CreateXlaClusterFormationPass(); -// Create a pass that rewrites entry functions with `_xla_compile_device` into a -// `tf.StatefulPartitionedCall` to the original function. 
-std::unique_ptr> -CreateXlaOutlineEntryFunctionsPass(); - // Creates a pass that marks unsupported ops in device cluster for outside // compilation. std::unique_ptr> diff --git a/tensorflow/compiler/mlir/tf2xla/internal/passes/clustering_passes.td b/tensorflow/compiler/mlir/tf2xla/internal/passes/clustering_passes.td index 5b77ddd5afe991..2f617f7c154935 100644 --- a/tensorflow/compiler/mlir/tf2xla/internal/passes/clustering_passes.td +++ b/tensorflow/compiler/mlir/tf2xla/internal/passes/clustering_passes.td @@ -253,56 +253,6 @@ def XlaClusterFormationPass : Pass<"tf-xla-cluster-formation", "ModuleOp"> { let dependentDialects = ["mlir::tf_device::TensorFlowDeviceDialect"]; } -def XlaOutlineEntryFunctionsPass : Pass<"tf-xla-outline-entry-functions", "ModuleOp"> { - let summary = "Outline the body of an entry function into a call to the " - "original function body"; - let description = [{ - This pass adds support for top-level function with - `_xla_compile_device_type` attribute in MLIR generic pipeline. - It renames such a function, and creates a new function taking the original - name with a `tf.StatefulPartitionedCall` to the original function. It - allows the MLIR generic pipeline to handle such functions the same way it - handles other partitioned calls with the attribute. 
- - For example, the following code - - ```mlir - func.func @main(%arg0: tensor) -> tensor attributes {_xla_compile_device_type = "CPU", allow_soft_placement = true, tf.entry_function = {}} { - %0 = "tf.StatefulPartitionedCall"(%arg0) {config = "", config_proto = "", executor_type = "", _xla_compile_device_type = "CPU", f = @stateful_pcall_func} : (tensor) -> (tensor) - %1 = "tf.Const"() {value = dense<5> : tensor} : () -> tensor - %2 = "tf.Add"(%0, %1) : (tensor, tensor) -> (tensor) - func.return %2 : tensor - } - - func.func @stateful_pcall_func(%arg0: tensor) -> tensor { - func.return %arg0 : tensor - } - ``` - - will be replaced as - - ```mlir - func.func private @main_outlined(%arg0: tensor) -> tensor attributes {_xla_compile_device_type = "CPU", allow_soft_placement = true} { - %0 = "tf.StatefulPartitionedCall"(%arg0) {_xla_compile_device_type = "CPU", config = "", config_proto = "", executor_type = "", f = @stateful_pcall_func} : (tensor) -> tensor - %cst = "tf.Const"() {value = dense<5> : tensor} : () -> tensor - %1 = "tf.Add"(%0, %cst) : (tensor, tensor) -> tensor - return %1 : tensor - } - - func.func @main(%arg0: tensor) -> tensor attributes {_xla_compile_device_type = "CPU", tf.entry_function = {}} { - %0 = "tf.StatefulPartitionedCall"(%arg0) {_xla_compile_device_type = "CPU", allow_soft_placement = true, config = "", config_proto = "", executor_type = "", f = @main_outlined} : (tensor) -> tensor - return %0 : tensor - } - - func.func @stateful_pcall_func(%arg0: tensor) -> tensor { - func.return %arg0 : tensor - } - ``` - }]; - let constructor = "tensorflow::tf2xla::internal::CreateXlaOutlineEntryFunctionsPass()"; - let dependentDialects = ["mlir::tf_device::TensorFlowDeviceDialect"]; -} - def MarkOpsForOutsideCompilationPass : Pass<"tf-mark-ops-for-outside-compilation", "ModuleOp"> { let summary = "Marks ops in device cluster for outside compilation if they are unsupported on device."; diff --git 
a/tensorflow/compiler/mlir/tf2xla/internal/passes/tpu_cluster_formation.cc b/tensorflow/compiler/mlir/tf2xla/internal/passes/tpu_cluster_formation.cc index 637369ed4fb6fc..b600c865661d58 100644 --- a/tensorflow/compiler/mlir/tf2xla/internal/passes/tpu_cluster_formation.cc +++ b/tensorflow/compiler/mlir/tf2xla/internal/passes/tpu_cluster_formation.cc @@ -59,7 +59,6 @@ limitations under the License. #include "tensorflow/compiler/mlir/tensorflow/utils/attribute_utils.h" #include "tensorflow/compiler/mlir/tensorflow/utils/string_util.h" #include "tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.h" -#include "tensorflow/core/lib/monitoring/counter.h" #include "tensorflow/core/util/device_name_utils.h" namespace tensorflow { @@ -96,8 +95,6 @@ constexpr llvm::StringRef kNoReplicationCluster = "__no_replication_cluster"; constexpr llvm::StringRef kBadReplicateInfoAttrMsg = "requires '_replication_info' string attribute"; -constexpr char kUseMlirBridge[] = "kUseMlirBridge"; - // Mapping for `_replication_info` attribute to TPUReplicateMetadata attributes. using MetadataMap = llvm::SmallDenseMap; @@ -108,15 +105,6 @@ using OpSetVector = llvm::SmallSetVector; // Mapping for `_replication_info` attribute to ops of a cluster. 
using ClusterMap = llvm::SmallDenseMap; -auto* jit_compile_single_core_tpu_count = - tensorflow::monitoring::Counter<1>::New( - /* metric name */ - "/tensorflow/core/jit_compile_single_core_tpu_count", - /* metric description */ - "Tracks if single core tpu support goes through the first " - "phase of the MLIR bridge", - /* metric field */ "use_mlir_bridge"); - #define GEN_PASS_DEF_TPUCLUSTERFORMATIONPASS #include "tensorflow/compiler/mlir/tf2xla/internal/passes/clustering_passes.h.inc" @@ -943,7 +931,7 @@ void SetNoReplicationClusterAttrs(mlir::tf_device::ClusterOp cluster, LogicalResult FormClustersInBlock( Block* block, const mlir::TF::SideEffectAnalysis::Info& side_effect_analysis, - bool strict_clusters, bool& has_replication_in_module) { + bool strict_clusters) { MetadataMap metadata_map; LogicalResult result = CollectMetadata(block, &metadata_map); if (failed(result)) return result; @@ -956,8 +944,7 @@ LogicalResult FormClustersInBlock( if (!llvm::hasSingleElement(region)) return op.emitOpError("Expected single block region"); if (failed(FormClustersInBlock(®ion.front(), side_effect_analysis, - strict_clusters, - has_replication_in_module))) + strict_clusters))) return mlir::failure(); } } @@ -998,7 +985,6 @@ LogicalResult FormClustersInBlock( block, cluster_ops, results, cluster_successor_ops.getArrayRef()); if (!has_replication) { - has_replication_in_module = false; SetNoReplicationClusterAttrs(cluster, device_type, device); continue; } @@ -1034,12 +1020,12 @@ LogicalResult FormClustersInBlock( LogicalResult FormClustersInFunction( mlir::func::FuncOp func, const mlir::TF::SideEffectAnalysis::Info& side_effect_analysis, - bool strict_clusters, bool& has_replication_in_module) { + bool strict_clusters) { if (!llvm::hasSingleElement(func)) return func.emitOpError("Expecting a single block function"); if (failed(FormClustersInBlock(&func.front(), side_effect_analysis, - strict_clusters, has_replication_in_module))) + strict_clusters))) return mlir::failure(); 
// Remove TPUReplicatedInput and TPUReplicatedOutput nodes. @@ -1091,17 +1077,12 @@ void TPUClusterFormationPass::runOnOperation() { }); auto& side_effect_analysis = getAnalysis(); - bool has_replication_in_module = true; for (auto func : getOperation().getOps()) if (!func.isExternal() && failed(FormClustersInFunction( func, side_effect_analysis.GetAnalysisForFunc(func), - strict_clusters_, has_replication_in_module))) + strict_clusters_))) return signalPassFailure(); - - if (!has_replication_in_module) { - jit_compile_single_core_tpu_count->GetCell(kUseMlirBridge)->IncrementBy(1); - } } } // anonymous namespace diff --git a/tensorflow/compiler/mlir/tf2xla/internal/passes/tpu_cluster_formation_test.cc b/tensorflow/compiler/mlir/tf2xla/internal/passes/tpu_cluster_formation_test.cc deleted file mode 100644 index 640385f0156aae..00000000000000 --- a/tensorflow/compiler/mlir/tf2xla/internal/passes/tpu_cluster_formation_test.cc +++ /dev/null @@ -1,100 +0,0 @@ -/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/ - -#include - -#include -#include "mlir/IR/BuiltinOps.h" // from @llvm-project -#include "mlir/IR/MLIRContext.h" // from @llvm-project -#include "mlir/IR/OwningOpRef.h" // from @llvm-project -#include "mlir/Pass/PassManager.h" // from @llvm-project -#include "mlir/Support/LogicalResult.h" // from @llvm-project -#include "tensorflow/compiler/mlir/tf2xla/internal/passes/clustering_passes.h" -#include "tensorflow/compiler/mlir/tf2xla/transforms/test_utils.h" -#include "tensorflow/core/lib/monitoring/cell_reader.h" -#include "tsl/platform/statusor.h" - -namespace tensorflow { -namespace tf2xla { -namespace internal { - -namespace { - -constexpr char kJitCompileSingleCoreTpuCount[] = - "/tensorflow/core/jit_compile_single_core_tpu_count"; -constexpr char kUseMlirBridge[] = "kUseMlirBridge"; -using mlir::mhlo::test::GetMlirModuleFromString; - -class TPUClusterFormationPassTest : public testing::Test { - protected: - void CreateModule(const char* module_string) { - TF_ASSERT_OK_AND_ASSIGN(module_, - GetMlirModuleFromString(module_string, &context_)); - bool strict_clusters = true; - pm_ = std::make_unique(&context_); - pm_->addPass(tensorflow::tf2xla::internal::CreateTPUClusterFormationPass( - strict_clusters)); - } - - mlir::LogicalResult Run() { return pm_->run(module_.get()); } - - private: - mlir::MLIRContext context_; - mlir::OwningOpRef module_; - std::unique_ptr pm_; -}; - -TEST_F(TPUClusterFormationPassTest, NonReplicatedTPU) { - monitoring::testing::CellReader feature_metric_reader( - kJitCompileSingleCoreTpuCount); - static constexpr char kMlirModuleStr[] = R"( - module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, producer = 268 : i32}} { - func.func @valid_compilation_cluster_no_replication() { - "tf.opA"() { _xla_compile_device_type = "TPU", is_stateless = true} : () -> () - "tf.opB"() { _xla_compile_device_type = "TPU", is_stateless = true} : () -> () - 
func.return - } - })"; - CreateModule(kMlirModuleStr); - auto result = Run(); - EXPECT_TRUE(result.succeeded()); - EXPECT_EQ(feature_metric_reader.Delta(kUseMlirBridge), 1); -} - -TEST_F(TPUClusterFormationPassTest, ReplicatedTPU) { - monitoring::testing::CellReader feature_metric_reader( - kJitCompileSingleCoreTpuCount); - static constexpr char kMlirModuleStr[] = R"( - module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, producer = 268 : i32}} { - func.func @interleaved_clusters(%arg0 : tensor) -> (tensor, tensor) { - "tf.TPUReplicateMetadata"() {_xla_compile_device_type = "TPU", _replication_info = "replicate_1", device = "device_1", num_replicas = 1, topology = "topology_1"} : () -> () - %0 = "tf.opA"(%arg0) {_xla_compile_device_type = "TPU", _replication_info = "replicate_0", is_stateless = true} : (tensor) -> tensor - %1 = "tf.opB"(%arg0) {_xla_compile_device_type = "TPU", _replication_info = "replicate_1", is_stateless = true} : (tensor) -> tensor - %2 = "tf.opC"(%0) {_xla_compile_device_type = "TPU", _replication_info = "replicate_0", is_stateless = true} : (tensor) -> tensor - %3 = "tf.opD"(%1) {_xla_compile_device_type = "TPU", _replication_info = "replicate_1", is_stateless = true} : (tensor) -> tensor - "tf.TPUReplicateMetadata"() {_xla_compile_device_type = "TPU", _replication_info = "replicate_0", device = "device_0", num_replicas = 1, topology = "topology_0"} : () -> () - func.return %2, %3 : tensor, tensor - } - })"; - CreateModule(kMlirModuleStr); - auto result = Run(); - EXPECT_TRUE(result.succeeded()); - EXPECT_EQ(feature_metric_reader.Delta(kUseMlirBridge), 0); -} - -} // namespace -} // namespace internal -} // namespace tf2xla -} // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tf2xla/internal/passes/xla_outline_entry_functions.cc b/tensorflow/compiler/mlir/tf2xla/internal/passes/xla_outline_entry_functions.cc deleted file mode 100644 index 78d700f1514b7c..00000000000000 --- 
a/tensorflow/compiler/mlir/tf2xla/internal/passes/xla_outline_entry_functions.cc +++ /dev/null @@ -1,147 +0,0 @@ -/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include -#include - -#include "llvm/ADT/SmallVector.h" -#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project -#include "mlir/IR/Block.h" // from @llvm-project -#include "mlir/IR/Builders.h" // from @llvm-project -#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project -#include "mlir/IR/BuiltinOps.h" // from @llvm-project -#include "mlir/IR/BuiltinTypes.h" // from @llvm-project -#include "mlir/IR/Location.h" // from @llvm-project -#include "mlir/IR/Operation.h" // from @llvm-project -#include "mlir/IR/SymbolTable.h" // from @llvm-project -#include "mlir/Pass/Pass.h" // from @llvm-project -#include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" -#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" -#include "tensorflow/compiler/mlir/tensorflow/utils/attribute_utils.h" -#include "tensorflow/compiler/mlir/tensorflow/utils/call_graph_util.h" - -namespace tensorflow { -namespace tf2xla { -namespace internal { - -using mlir::ModuleOp; -using mlir::Operation; -using mlir::SymbolTable; - -#define GEN_PASS_DEF_XLAOUTLINEENTRYFUNCTIONSPASS -#include "tensorflow/compiler/mlir/tf2xla/internal/passes/clustering_passes.h.inc" - -inline constexpr char 
kOutlinedFuncSuffix[] = "_outlined"; - -// Outlines the body of an entry function with `_xla_compile_device_type` -// attribute and calls the outlined function with a -// `tf.StatefulPartitionedCall`. -struct XlaOutlineEntryFunctionsPass - : public impl::XlaOutlineEntryFunctionsPassBase< - XlaOutlineEntryFunctionsPass> { - void runOnOperation() override; -}; - -void RenameFunction(mlir::func::FuncOp func, const std::string &new_func_name, - SymbolTable &symtab) { - symtab.remove(func); - symtab.setSymbolName(func, new_func_name); - // Name conflicts are resolved automatically by SymbolTable class by attaching - // a unique counter value to the names. - symtab.insert(func); -} - -// Propagate compilation markers from the source to the destination. -void PropagateCompilationMarkers(Operation *src, Operation *dest) { - mlir::TF::CopyUnderscoredAttributes(src, dest); - if (src->hasAttr(mlir::TF::kAllowSoftPlacementAttr)) { - dest->setAttr(mlir::TF::kAllowSoftPlacementAttr, - src->getAttr(mlir::TF::kAllowSoftPlacementAttr)); - } -} - -mlir::func::FuncOp CreateWrapperFunction(mlir::func::FuncOp func, - const std::string &caller_name, - const std::string &callee_name) { - mlir::OpBuilder builder(func); - mlir::OpBuilder::InsertionGuard guard(builder); - mlir::FunctionType func_type = func.getFunctionType(); - mlir::Location loc = func.getLoc(); - auto wrapper_func = mlir::func::FuncOp::create(loc, caller_name, func_type); - mlir::Block *block = builder.createBlock(&wrapper_func.getBody()); - block->addArguments( - wrapper_func.getArgumentTypes(), - llvm::SmallVector(wrapper_func.getNumArguments(), loc)); - auto pcall_op = builder.create( - loc, func_type.getResults(), wrapper_func.getArguments(), - mlir::SymbolRefAttr::get(builder.getContext(), callee_name), - builder.getStringAttr(""), builder.getStringAttr(""), - builder.getStringAttr("")); - builder.create(loc, pcall_op.getResults()); - PropagateCompilationMarkers(func, pcall_op); - // Mark the original function 
private so it can be inlined. - func.setVisibility(mlir::func::FuncOp::Visibility::Private); - return wrapper_func; -} - -void ReplaceEntryFunction(mlir::func::FuncOp original_func, - mlir::func::FuncOp new_func) { - auto move_attr = [&](auto attr, Operation *src, Operation *dest) { - if (src->hasAttr(attr)) { - dest->setAttr(attr, src->getAttr(attr)); - src->removeAttr(attr); - } - }; - - for (const auto &attr : mlir::GetEntryFunctionAttributeNames()) { - move_attr(attr, original_func, new_func); - } - mlir::TF::CopyDeviceAndUnderscoredAttributes(original_func, new_func); -} - -mlir::func::FuncOp RewriteEntryFunctionWithCompilationMarkers( - mlir::func::FuncOp entry_func, SymbolTable &symtab) { - const std::string entry_func_name = entry_func.getSymName().str(), - outlined_entry_func_name = - entry_func_name + kOutlinedFuncSuffix; - RenameFunction(entry_func, outlined_entry_func_name, symtab); - auto new_entry_func = CreateWrapperFunction(entry_func, entry_func_name, - outlined_entry_func_name); - ReplaceEntryFunction(entry_func, new_entry_func); - symtab.insert(new_entry_func); - return new_entry_func; -} - -void XlaOutlineEntryFunctionsPass::runOnOperation() { - ModuleOp module = getOperation(); - SymbolTable symtab(module); - - llvm::SmallVector entry_funcs = GetEntryFunctions(module); - - for (auto &entry_func : entry_funcs) { - if (entry_func->hasAttr(mlir::TF::kCompileDeviceTypeAttr)) { - RewriteEntryFunctionWithCompilationMarkers(entry_func, symtab); - } - } -} - -std::unique_ptr> -CreateXlaOutlineEntryFunctionsPass() { - return std::make_unique(); -} - -} // namespace internal -} // namespace tf2xla -} // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tf2xla/internal/test_matchers_test.cc b/tensorflow/compiler/mlir/tf2xla/internal/test_matchers_test.cc index 5ae2fcecedc3fc..696776f75b021c 100644 --- a/tensorflow/compiler/mlir/tf2xla/internal/test_matchers_test.cc +++ b/tensorflow/compiler/mlir/tf2xla/internal/test_matchers_test.cc @@ -43,14 
+43,14 @@ template tsl::StatusOr success(T t) { return t; } -tsl::StatusOr success() { return kArbitraryIntResult; } +absl::StatusOr success() { return kArbitraryIntResult; } template tsl::StatusOr filtered(T t) { return tsl::StatusOr(tensorflow::CompileToHloGraphAnalysisFailedError()); } -tsl::StatusOr filtered() { return filtered(kArbitraryIntResult); } -tsl::StatusOr failed() { - return tsl::StatusOr(absl::InternalError("fail")); +absl::StatusOr filtered() { return filtered(kArbitraryIntResult); } +absl::StatusOr failed() { + return absl::StatusOr(absl::InternalError("fail")); } TEST(TestUtil, MatchesOk) { ASSERT_THAT(success(), IsOkOrFiltered()); } diff --git a/tensorflow/compiler/mlir/tf2xla/tests/adjust-layout.mlir b/tensorflow/compiler/mlir/tf2xla/tests/adjust-layout.mlir index dde3c4213e6146..70b42f32392ca1 100644 --- a/tensorflow/compiler/mlir/tf2xla/tests/adjust-layout.mlir +++ b/tensorflow/compiler/mlir/tf2xla/tests/adjust-layout.mlir @@ -4,7 +4,9 @@ func.func @infeed_dequeue_tuple() -> (tensor<1x8x4x4xi32>, tensor<1x100x1xf32>) // CHECK: [[TOKEN:%.*]] = mhlo.create_token : !mhlo.token %0 = "mhlo.create_token"() : () -> !mhlo.token - // CHECK: [[INFEED:%.*]]:3 = "mhlo.infeed"([[TOKEN]]) {infeed_config = "", layout = [{{\[1, 3, 2, 0], \[1, 2, 0]}}]} : (!mhlo.token) -> (tensor<1x8x4x4xi32>, tensor<1x100x1xf32>, !mhlo.token) + // CHECK: [[INFEED:%.*]]:3 = "mhlo.infeed"([[TOKEN]]) <{ + // CHECK-SAME{LITERAL}: infeed_config = "", layout = [[1, 3, 2, 0], [1, 2, 0]] + // CHECK-SAME: }> : (!mhlo.token) -> (tensor<1x8x4x4xi32>, tensor<1x100x1xf32>, !mhlo.token) %1:3 = "mhlo.infeed"(%0) {infeed_config = ""} : (!mhlo.token) -> (tensor<1x8x4x4xi32>, tensor<1x100x1xf32>, !mhlo.token) // CHECK: return [[INFEED]]#0, [[INFEED]]#1 diff --git a/tensorflow/compiler/mlir/tf2xla/tests/legalize-tf-BatchMatMulV2.mlir b/tensorflow/compiler/mlir/tf2xla/tests/legalize-tf-BatchMatMulV2.mlir index ebefc1ca1ab140..f62e9a140e83d9 100644 --- 
a/tensorflow/compiler/mlir/tf2xla/tests/legalize-tf-BatchMatMulV2.mlir +++ b/tensorflow/compiler/mlir/tf2xla/tests/legalize-tf-BatchMatMulV2.mlir @@ -15,7 +15,7 @@ func.func @batchmatmulv2_basic(%arg0: tensor<1x4x2xf32>, %arg1: tensor<3x2x4xf32 // CHECK: [[BCASTHEAD:%.*]] = shape.broadcast [[LHSHEAD]], [[RHSHEAD]] // CHECK: [[LHSBCASTSHAPE:%.*]] = shape.concat [[BCASTHEAD]], [[LHSTAIL]] // CHECK: [[LHSSHAPEEXTENTS:%.*]] = shape.to_extent_tensor [[LHSBCASTSHAPE]] -// CHECK: [[LHSBCAST:%.*]] = "mhlo.dynamic_broadcast_in_dim"([[LHS]], [[LHSSHAPEEXTENTS]]) {broadcast_dimensions = dense<[0, 1, 2]> : tensor<3xi64>} : (tensor<1x4x2xf32>, tensor<3xindex>) -> tensor<3x4x2xf32> +// CHECK: [[LHSBCAST:%.*]] = "mhlo.dynamic_broadcast_in_dim"([[LHS]], [[LHSSHAPEEXTENTS]]) <{broadcast_dimensions = dense<[0, 1, 2]> : tensor<3xi64>}> : (tensor<1x4x2xf32>, tensor<3xindex>) -> tensor<3x4x2xf32> // CHECK: [[RHSBCASTSHAPE:%.*]] = shape.concat [[BCASTHEAD]], [[RHSTAIL]] // CHECK: [[RESULT:%.*]] = "mhlo.dot_general"([[LHSBCAST]], [[RHS]]) // CHECK: return [[RESULT]] : tensor<3x4x4xf32> @@ -27,8 +27,8 @@ func.func @batchmatmulv2_basic(%arg0: tensor<1x4x2xf32>, %arg1: tensor<3x2x4xf32 func.func @batchmatmulv2_lhs_batch(%arg0: tensor<3x4x2xf32>, %arg1: tensor<2x4xf32>) -> tensor<3x4x4xf32> { // CHECK-LABEL: func @batchmatmulv2_lhs_batch -// CHECK: "mhlo.dynamic_broadcast_in_dim"({{.*}}, {{.*}}) {broadcast_dimensions = dense<[1, 2]> : tensor<2xi64>} -// CHECK: "mhlo.dot_general"({{.*}}, {{.*}}) { +// CHECK: "mhlo.dynamic_broadcast_in_dim"({{.*}}, {{.*}}) <{broadcast_dimensions = dense<[1, 2]> : tensor<2xi64>}> +// CHECK: "mhlo.dot_general"({{.*}}, {{.*}}) <{ // CHECK-SAME: lhs_batching_dimensions = [0] // CHECK-SAME: rhs_batching_dimensions = [0] // CHECK-SAME: lhs_contracting_dimensions = [2] @@ -39,8 +39,8 @@ func.func @batchmatmulv2_lhs_batch(%arg0: tensor<3x4x2xf32>, %arg1: tensor<2x4xf func.func @batchmatmulv2_rhs_batch(%arg0: tensor<4x2xf32>, %arg1: tensor<3x2x4xf32>) -> 
tensor<3x4x4xf32> { // CHECK-LABEL: func @batchmatmulv2_rhs_batch -// CHECK: "mhlo.dynamic_broadcast_in_dim"({{.*}}, {{.*}}) {broadcast_dimensions = dense<[1, 2]> : tensor<2xi64>} -// CHECK: "mhlo.dot_general"({{.*}}, {{.*}}) { +// CHECK: "mhlo.dynamic_broadcast_in_dim"({{.*}}, {{.*}}) <{broadcast_dimensions = dense<[1, 2]> : tensor<2xi64>}> +// CHECK: "mhlo.dot_general"({{.*}}, {{.*}}) <{ // CHECK-SAME: lhs_batching_dimensions = [0] // CHECK-SAME: rhs_batching_dimensions = [0] // CHECK-SAME: lhs_contracting_dimensions = [2] @@ -51,7 +51,7 @@ func.func @batchmatmulv2_rhs_batch(%arg0: tensor<4x2xf32>, %arg1: tensor<3x2x4xf func.func @batchmatmulv2_dynamic(%arg0: tensor, %arg1: tensor) -> tensor { // CHECK-LABEL: func @batchmatmulv2_dynamic -// CHECK: "mhlo.dot_general"({{.*}}, {{.*}}) { +// CHECK: "mhlo.dot_general"({{.*}}, {{.*}}) <{ // CHECK-SAME: lhs_batching_dimensions = [0] // CHECK-SAME: rhs_batching_dimensions = [0] // CHECK-SAME: lhs_contracting_dimensions = [2] @@ -62,7 +62,7 @@ func.func @batchmatmulv2_dynamic(%arg0: tensor, %arg1: tensor, %arg1: tensor<4x2xf32>) -> tensor<5x4xf32> { // CHECK-LABEL: func @batchmatmulv2_adj_real -// CHECK: "mhlo.dot_general"({{.*}}, {{.*}}) { +// CHECK: "mhlo.dot_general"({{.*}}, {{.*}}) <{ // CHECK-NOT: lhs_batching_dimensions // CHECK-NOT: rhs_batching_dimensions // CHECK-SAME: lhs_contracting_dimensions = [0] diff --git a/tensorflow/compiler/mlir/tf2xla/tests/legalize-tf-binary-elementwise.mlir b/tensorflow/compiler/mlir/tf2xla/tests/legalize-tf-binary-elementwise.mlir index 01dc4701923675..da64452a3039f8 100644 --- a/tensorflow/compiler/mlir/tf2xla/tests/legalize-tf-binary-elementwise.mlir +++ b/tensorflow/compiler/mlir/tf2xla/tests/legalize-tf-binary-elementwise.mlir @@ -23,7 +23,7 @@ func.func @add(%arg0: tensor<2xi32>) -> tensor<2xi32> { // patterns unambiguous and more interesting (once broadcastable trait is // fixed upstream). 
func.func @broadcast_add(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi32> { - // CHECK-NEXT: %[[LHS_BCAST:.+]] = "mhlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<1> : tensor<1xi64>} + // CHECK-NEXT: %[[LHS_BCAST:.+]] = "mhlo.broadcast_in_dim"(%arg0) <{broadcast_dimensions = dense<1> : tensor<1xi64>}> // CHECK-NEXT: mhlo.add %[[LHS_BCAST]], %arg1 %0 = "tf.AddV2"(%arg0, %arg1) : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi32> func.return %0: tensor<1x2xi32> @@ -33,7 +33,7 @@ func.func @broadcast_add(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor // TODO(laurenzo): Change this to a (4x1x1 + 1x4x4x4) shaped add once upstream // broadcastable bug is fixed (helps make the CHECK matching unambiguous) func.func @broadcast_multi_dim_add(%arg0: tensor<4x1x1xi32>, %arg1: tensor<4x4x4x4xi32>) -> tensor<4x4x4x4xi32> { - // CHECK-NEXT: %[[LHS_BCAST:.+]] = "mhlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<[1, 2, 3]> : tensor<3xi64>} + // CHECK-NEXT: %[[LHS_BCAST:.+]] = "mhlo.broadcast_in_dim"(%arg0) <{broadcast_dimensions = dense<[1, 2, 3]> : tensor<3xi64>}> // CHECK-NEXT: mhlo.add %[[LHS_BCAST]], %arg1 %0 = "tf.AddV2"(%arg0, %arg1) : (tensor<4x1x1xi32>, tensor<4x4x4x4xi32>) -> tensor<4x4x4x4xi32> func.return %0: tensor<4x4x4x4xi32> @@ -48,8 +48,8 @@ func.func @add_dynamic(%arg0: tensor, %arg1: tensor) -> tensor, tensor<2xindex> -> tensor<2xindex> - // CHECK-NEXT: %[[LHS_BCAST:.+]] = "mhlo.dynamic_broadcast_in_dim"(%arg0, %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} - // CHECK-NEXT: %[[RHS_BCAST:.+]] = "mhlo.dynamic_broadcast_in_dim"(%arg1, %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} + // CHECK-NEXT: %[[LHS_BCAST:.+]] = "mhlo.dynamic_broadcast_in_dim"(%arg0, %[[RESULT_EXTENTS]]) <{broadcast_dimensions = dense<1> : tensor<1xi64>}> + // CHECK-NEXT: %[[RHS_BCAST:.+]] = "mhlo.dynamic_broadcast_in_dim"(%arg1, %[[RESULT_EXTENTS]]) <{broadcast_dimensions = dense<[0, 1]> : 
tensor<2xi64>}> // CHECK-NEXT: %[[RESULT:.+]] = mhlo.add %[[LHS_BCAST]], %[[RHS_BCAST]] : tensor // CHECK-NEXT: shape.assuming_yield %[[RESULT]] %0 = "tf.AddV2"(%arg0, %arg1) : (tensor, tensor) -> tensor @@ -149,7 +149,7 @@ func.func @shift_right_unsigned(%arg0: tensor<4xui8>, %arg1: tensor<4xui8>) -> t // CHECK-LABEL: func @broadcast_shift_right_unsigned func.func @broadcast_shift_right_unsigned(%arg0: tensor<4xui8>, %arg1: tensor<2x4xui8>) -> tensor<2x4xui8> { - // CHECK: %[[BROADCAST:.*]] = "mhlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<4xui8>) -> tensor<2x4xui8> + // CHECK: %[[BROADCAST:.*]] = "mhlo.broadcast_in_dim"(%arg0) <{broadcast_dimensions = dense<1> : tensor<1xi64>}> : (tensor<4xui8>) -> tensor<2x4xui8> // CHECK: mhlo.shift_right_logical %[[BROADCAST]], %arg1 : tensor<2x4xui8> %0 = "tf.RightShift"(%arg0, %arg1) : (tensor<4xui8>, tensor<2x4xui8>) -> tensor<2x4xui8> func.return %0 : tensor<2x4xui8> @@ -248,8 +248,8 @@ func.func @equal_dynamic(%arg0: tensor, %arg1: tensor<1xi32>) -> tensor, tensor -> tensor // NOT-CHECK-NEXT: %[[RESULT_EXTENTS:.+]] = tensor.cast %[[RESULT_SHAPE]] : tensor to tensor<1xindex> - // NOT-CHECK-DAG: %[[LHS_BCAST:.+]] = "mhlo.dynamic_broadcast_in_dim"(%arg0, %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<0> : tensor<1xi64>} - // NOT-CHECK-DAG: %[[RHS_BCAST:.+]] = "mhlo.dynamic_broadcast_in_dim"(%arg1, %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<0> : tensor<1xi64>} + // NOT-CHECK-DAG: %[[LHS_BCAST:.+]] = "mhlo.dynamic_broadcast_in_dim"(%arg0, %[[RESULT_EXTENTS]]) <{broadcast_dimensions = dense<0> : tensor<1xi64>}> + // NOT-CHECK-DAG: %[[RHS_BCAST:.+]] = "mhlo.dynamic_broadcast_in_dim"(%arg1, %[[RESULT_EXTENTS]]) <{broadcast_dimensions = dense<0> : tensor<1xi64>}> // NOT-CHECK-NEXT: %[[RESULT:.+]] = mhlo.compare EQ, %[[LHS_BCAST]], %[[RHS_BCAST]] // NOT-CHECK-NEXT: shape.assuming_yield %[[RESULT]] %0 = "tf.Equal"(%arg0, %arg1) : (tensor, tensor<1xi32>) -> tensor @@ -258,7 
+258,7 @@ func.func @equal_dynamic(%arg0: tensor, %arg1: tensor<1xi32>) -> tensor, %arg1: tensor<1x2xi32>) -> tensor<1x2xi1> { - // CHECK-DAG: %[[LHS_BCAST:.+]] = "mhlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<1> : tensor<1xi64>} + // CHECK-DAG: %[[LHS_BCAST:.+]] = "mhlo.broadcast_in_dim"(%arg0) <{broadcast_dimensions = dense<1> : tensor<1xi64>}> // CHECK-NEXT: mhlo.compare EQ, %[[LHS_BCAST]], %arg1 %0 = "tf.Equal"(%arg0, %arg1) : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> func.return %0: tensor<1x2xi1> @@ -329,7 +329,7 @@ func.func @greater(%arg0: tensor<2xi32>, %arg1: tensor<2xi32>) -> tensor<2xi1> { // CHECK-LABEL: func @broadcast_greater func.func @broadcast_greater(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi1> { - // CHECK-NEXT: %[[LHS_BCAST:.+]] = "mhlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<1> : tensor<1xi64>} + // CHECK-NEXT: %[[LHS_BCAST:.+]] = "mhlo.broadcast_in_dim"(%arg0) <{broadcast_dimensions = dense<1> : tensor<1xi64>}> // CHECK-NEXT: mhlo.compare GT, %[[LHS_BCAST]], %arg1 %0 = "tf.Greater"(%arg0, %arg1) : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> func.return %0: tensor<1x2xi1> @@ -344,8 +344,8 @@ func.func @greater_dynamic(%arg0: tensor, %arg1: tensor) -> tensor // CHECK-DAG: %[[LHS_SHAPE1:.+]] = shape.shape_of %arg0 // CHECK-DAG: %[[RHS_SHAPE1:.+]] = shape.shape_of %arg1 // CHECK-NEXT: %[[RESULT_EXTENTS:.+]] = shape.broadcast %[[LHS_SHAPE1]], %[[RHS_SHAPE1]] : tensor<1xindex>, tensor<1xindex> -> tensor<1xindex> - // CHECK-DAG: %[[LHS_BCAST:.+]] = "mhlo.dynamic_broadcast_in_dim"(%arg0, %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<0> : tensor<1xi64>} - // CHECK-DAG: %[[RHS_BCAST:.+]] = "mhlo.dynamic_broadcast_in_dim"(%arg1, %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<0> : tensor<1xi64>} + // CHECK-DAG: %[[LHS_BCAST:.+]] = "mhlo.dynamic_broadcast_in_dim"(%arg0, %[[RESULT_EXTENTS]]) <{broadcast_dimensions = dense<0> : tensor<1xi64>}> + // CHECK-DAG: %[[RHS_BCAST:.+]] = 
"mhlo.dynamic_broadcast_in_dim"(%arg1, %[[RESULT_EXTENTS]]) <{broadcast_dimensions = dense<0> : tensor<1xi64>}> // CHECK-NEXT: mhlo.compare GT, %[[LHS_BCAST]], %[[RHS_BCAST]] %0 = "tf.Greater"(%arg0, %arg1) : (tensor, tensor) -> tensor func.return %0: tensor diff --git a/tensorflow/compiler/mlir/tf2xla/tests/legalize-tf-collective.mlir b/tensorflow/compiler/mlir/tf2xla/tests/legalize-tf-collective.mlir index fa0bc94a980eb0..9c5653c61b9703 100644 --- a/tensorflow/compiler/mlir/tf2xla/tests/legalize-tf-collective.mlir +++ b/tensorflow/compiler/mlir/tf2xla/tests/legalize-tf-collective.mlir @@ -8,9 +8,9 @@ func.func @all_reduce_cross_replica(%input: tensor) -> tensor { %group_assignment = "tf.Const"() { value = dense<[[0],[1]]> : tensor<2x1xi32> } : () -> tensor<2x1xi32> // CHECK: "mhlo.all_reduce" - // CHECK: mhlo.add // CHECK{LITERAL}: replica_groups = dense<[[0], [1]]> : tensor<2x1xi64> // CHECK-NOT: channel_handle + // CHECK: mhlo.add %0 = "tf.XlaAllReduce"(%input, %group_assignment) {reduce_op = "Add", mode = "CrossReplica"} : (tensor, tensor<2x1xi32>) -> tensor func.return %0 : tensor } @@ -24,16 +24,16 @@ func.func @all_reduce_cross_replica(%input: tensor) -> tensor { func.func @all_reduce_cross_replica_and_partition(%input: tensor) -> tensor { %group_assignment = "tf.Const"() { value = dense<[[0],[1]]> : tensor<2x1xi32> } : () -> tensor<2x1xi32> // CHECK: "mhlo.all_reduce" + // CHECK-SAME: channel_handle = #mhlo.channel_handle + // CHECK-SAME{LITERAL}: replica_groups = dense<[[0], [1]]> : tensor<2x1xi64> // CHECK: mhlo.add // CHECK: mhlo.return - // CHECK-NEXT: channel_handle = #mhlo.channel_handle - // CHECK-SAME{LITERAL}: replica_groups = dense<[[0], [1]]> : tensor<2x1xi64> %0 = "tf.XlaAllReduce"(%input, %group_assignment) {reduce_op = "Add", mode = "CrossReplicaAndPartition"} : (tensor, tensor<2x1xi32>) -> tensor // CHECK: "mhlo.all_reduce" + // CHECK-SAME: channel_handle = #mhlo.channel_handle + // CHECK-SAME{LITERAL}: replica_groups = dense<[[0], [1]]> : 
tensor<2x1xi64> // CHECK: mhlo.add // CHECK: mhlo.return - // CHECK-NEXT: channel_handle = #mhlo.channel_handle - // CHECK-SAME{LITERAL}: replica_groups = dense<[[0], [1]]> : tensor<2x1xi64> %1 = "tf.XlaAllReduce"(%input, %group_assignment) {reduce_op = "Add", mode = "CrossReplicaAndPartition"} : (tensor, tensor<2x1xi32>) -> tensor %2 = "tf.Add"(%0, %1) : (tensor, tensor) -> tensor func.return %2 : tensor @@ -110,16 +110,16 @@ func.func @collective_reduce_v2(%input: tensor) -> tensor { %group_size = "tf.Const"() { value = dense<2> : tensor } : () -> tensor %instance_key = "tf.Const"() { value = dense<3> : tensor } : () -> tensor // CHECK: "mhlo.all_reduce" + // CHECK-SAME: channel_handle = #mhlo.channel_handle + // CHECK-SAME{LITERAL}: replica_groups = dense<[[0, 1]]> : tensor<1x2xi64> // CHECK: mhlo.add // CHECK: mhlo.return - // CHECK-NEXT: channel_handle = #mhlo.channel_handle - // CHECK-SAME{LITERAL}: replica_groups = dense<[[0, 1]]> : tensor<1x2xi64> %0 = "tf.CollectiveReduceV2"(%input, %group_size, %group_key, %instance_key) {merge_op = "Add", final_op = "Id"} : (tensor, tensor, tensor, tensor) -> tensor // CHECK: "mhlo.all_reduce" + // CHECK-SAME: channel_handle = #mhlo.channel_handle + // CHECK-SAME{LITERAL}: replica_groups = dense<[[0, 1]]> : tensor<1x2xi64> // CHECK: mhlo.add // CHECK: mhlo.return - // CHECK-NEXT: channel_handle = #mhlo.channel_handle - // CHECK-SAME{LITERAL}: replica_groups = dense<[[0, 1]]> : tensor<1x2xi64> %1 = "tf.CollectiveReduceV2"(%input, %group_size, %group_key, %instance_key) {merge_op = "Add", final_op = "Id"} : (tensor, tensor, tensor, tensor) -> tensor %2 = "tf.Add"(%0, %1) : (tensor, tensor) -> tensor func.return %2 : tensor @@ -133,10 +133,10 @@ func.func @collective_reduce_v2_add_id(%input: tensor) -> tensor { %group_size = "tf.Const"() { value = dense<2> : tensor } : () -> tensor %instance_key = "tf.Const"() { value = dense<3> : tensor } : () -> tensor // CHECK: %[[REDUCE:.*]] = "mhlo.all_reduce" + // CHECK-SAME{LITERAL}: 
replica_groups = dense<[[0, 1]]> : tensor<1x2xi64> // CHECK: mhlo.add // CHECK: mhlo.return - // CHECK-NEXT{LITERAL}: replica_groups = dense<[[0, 1]]> : tensor<1x2xi64> - // CHECK-NEXT: return %[[REDUCE]] + // CHECK: return %[[REDUCE]] %0 = "tf.CollectiveReduceV2"(%input, %group_size, %group_key, %instance_key) {merge_op = "Add", final_op = "Id"} : (tensor, tensor, tensor, tensor) -> tensor func.return %0 : tensor } @@ -147,10 +147,10 @@ func.func @collective_reduce_v2_max_id(%input: tensor) -> tensor { %group_size = "tf.Const"() { value = dense<2> : tensor } : () -> tensor %instance_key = "tf.Const"() { value = dense<3> : tensor } : () -> tensor // CHECK: %[[REDUCE:.*]] = "mhlo.all_reduce" + // CHECK-SAME{LITERAL}: replica_groups = dense<[[0, 1]]> : tensor<1x2xi64> // CHECK: mhlo.maximum // CHECK: mhlo.return - // CHECK-NEXT{LITERAL}: replica_groups = dense<[[0, 1]]> : tensor<1x2xi64> - // CHECK-NEXT: return %[[REDUCE]] + // CHECK: return %[[REDUCE]] %0 = "tf.CollectiveReduceV2"(%input, %group_size, %group_key, %instance_key) {merge_op = "Max", final_op = "Id"} : (tensor, tensor, tensor, tensor) -> tensor func.return %0 : tensor } @@ -161,10 +161,10 @@ func.func @collective_reduce_v2_min_id(%input: tensor) -> tensor { %group_size = "tf.Const"() { value = dense<2> : tensor } : () -> tensor %instance_key = "tf.Const"() { value = dense<3> : tensor } : () -> tensor // CHECK: %[[REDUCE:.*]] = "mhlo.all_reduce" + // CHECK-SAME{LITERAL}: replica_groups = dense<[[0, 1]]> : tensor<1x2xi64> // CHECK: mhlo.minimum // CHECK: mhlo.return - // CHECK-NEXT{LITERAL}: replica_groups = dense<[[0, 1]]> : tensor<1x2xi64> - // CHECK-NEXT: return %[[REDUCE]] + // CHECK: return %[[REDUCE]] %0 = "tf.CollectiveReduceV2"(%input, %group_size, %group_key, %instance_key) {merge_op = "Min", final_op = "Id"} : (tensor, tensor, tensor, tensor) -> tensor func.return %0 : tensor } @@ -175,10 +175,10 @@ func.func @collective_reduce_v2_mul_id(%input: tensor) -> tensor { %group_size = "tf.Const"() { 
value = dense<2> : tensor } : () -> tensor %instance_key = "tf.Const"() { value = dense<3> : tensor } : () -> tensor // CHECK: %[[REDUCE:.*]] = "mhlo.all_reduce" + // CHECK-SAME{LITERAL}: replica_groups = dense<[[0, 1]]> : tensor<1x2xi64> // CHECK: mhlo.mul // CHECK: mhlo.return - // CHECK-NEXT{LITERAL}: replica_groups = dense<[[0, 1]]> : tensor<1x2xi64> - // CHECK-NEXT: return %[[REDUCE]] + // CHECK: return %[[REDUCE]] %0 = "tf.CollectiveReduceV2"(%input, %group_size, %group_key, %instance_key) {merge_op = "Mul", final_op = "Id"} : (tensor, tensor, tensor, tensor) -> tensor func.return %0 : tensor } @@ -190,10 +190,10 @@ func.func @collective_reduce_v2_add_div(%input: tensor) -> tensor { %instance_key = "tf.Const"() { value = dense<3> : tensor } : () -> tensor // CHECK: %[[GROUP_SIZE:.*]] = mhlo.constant dense<2.000000e+00> // CHECK: %[[REDUCE:.*]] = "mhlo.all_reduce" + // CHECK-SAME{LITERAL}: replica_groups = dense<[[0, 1]]> : tensor<1x2xi64> // CHECK: mhlo.add // CHECK: mhlo.return - // CHECK-NEXT{LITERAL}: replica_groups = dense<[[0, 1]]> : tensor<1x2xi64> - // CHECK-NEXT: %[[RESULT:.*]] = mhlo.divide %[[REDUCE]], %[[GROUP_SIZE]] + // CHECK: %[[RESULT:.*]] = mhlo.divide %[[REDUCE]], %[[GROUP_SIZE]] // CHECK-NEXT: return %[[RESULT]] %0 = "tf.CollectiveReduceV2"(%input, %group_size, %group_key, %instance_key) {merge_op = "Add", final_op = "Div"} : (tensor, tensor, tensor, tensor) -> tensor func.return %0 : tensor @@ -206,10 +206,10 @@ func.func @collective_reduce_v2_max_div(%input: tensor) -> tensor { %instance_key = "tf.Const"() { value = dense<3> : tensor } : () -> tensor // CHECK: %[[GROUP_SIZE:.*]] = mhlo.constant dense<2.000000e+00> // CHECK: %[[REDUCE:.*]] = "mhlo.all_reduce" + // CHECK-SAME{LITERAL}: replica_groups = dense<[[0, 1]]> : tensor<1x2xi64> // CHECK: mhlo.maximum // CHECK: mhlo.return - // CHECK-NEXT{LITERAL}: replica_groups = dense<[[0, 1]]> : tensor<1x2xi64> - // CHECK-NEXT: %[[RESULT:.*]] = mhlo.divide %[[REDUCE]], %[[GROUP_SIZE]] + // CHECK: 
%[[RESULT:.*]] = mhlo.divide %[[REDUCE]], %[[GROUP_SIZE]] // CHECK-NEXT: return %[[RESULT]] %0 = "tf.CollectiveReduceV2"(%input, %group_size, %group_key, %instance_key) {merge_op = "Max", final_op = "Div"} : (tensor, tensor, tensor, tensor) -> tensor func.return %0 : tensor @@ -222,10 +222,10 @@ func.func @collective_reduce_v2_min_div(%input: tensor) -> tensor { %instance_key = "tf.Const"() { value = dense<3> : tensor } : () -> tensor // CHECK: %[[GROUP_SIZE:.*]] = mhlo.constant dense<2.000000e+00> // CHECK: %[[REDUCE:.*]] = "mhlo.all_reduce" + // CHECK-SAME{LITERAL}: replica_groups = dense<[[0, 1]]> : tensor<1x2xi64> // CHECK: mhlo.minimum // CHECK: mhlo.return - // CHECK-NEXT{LITERAL}: replica_groups = dense<[[0, 1]]> : tensor<1x2xi64> - // CHECK-NEXT: %[[RESULT:.*]] = mhlo.divide %[[REDUCE]], %[[GROUP_SIZE]] + // CHECK: %[[RESULT:.*]] = mhlo.divide %[[REDUCE]], %[[GROUP_SIZE]] // CHECK-NEXT: return %[[RESULT]] %0 = "tf.CollectiveReduceV2"(%input, %group_size, %group_key, %instance_key) {merge_op = "Min", final_op = "Div"} : (tensor, tensor, tensor, tensor) -> tensor func.return %0 : tensor @@ -238,10 +238,10 @@ func.func @collective_reduce_v2_mul_div(%input: tensor) -> tensor { %instance_key = "tf.Const"() { value = dense<3> : tensor } : () -> tensor // CHECK: %[[GROUP_SIZE:.*]] = mhlo.constant dense<2.000000e+00> // CHECK: %[[REDUCE:.*]] = "mhlo.all_reduce" + // CHECK-SAME{LITERAL}: replica_groups = dense<[[0, 1]]> : tensor<1x2xi64> // CHECK: mhlo.mul // CHECK: mhlo.return - // CHECK-NEXT{LITERAL}: replica_groups = dense<[[0, 1]]> : tensor<1x2xi64> - // CHECK-NEXT: %[[RESULT:.*]] = mhlo.divide %[[REDUCE]], %[[GROUP_SIZE]] + // CHECK: %[[RESULT:.*]] = mhlo.divide %[[REDUCE]], %[[GROUP_SIZE]] // CHECK-NEXT: return %[[RESULT]] %0 = "tf.CollectiveReduceV2"(%input, %group_size, %group_key, %instance_key) {merge_op = "Mul", final_op = "Div"} : (tensor, tensor, tensor, tensor) -> tensor func.return %0 : tensor diff --git 
a/tensorflow/compiler/mlir/tf2xla/tests/legalize-tf-prefer-tf2xla.mlir b/tensorflow/compiler/mlir/tf2xla/tests/legalize-tf-prefer-tf2xla.mlir index c1bff70e2e4ff6..49a26ef844623f 100644 --- a/tensorflow/compiler/mlir/tf2xla/tests/legalize-tf-prefer-tf2xla.mlir +++ b/tensorflow/compiler/mlir/tf2xla/tests/legalize-tf-prefer-tf2xla.mlir @@ -69,22 +69,22 @@ func.func @random_uniform_without_seeds(%arg0: tensor<4xi32>) -> tensor<32x12x12 func.func @random_uniform_with_seeds(%arg0: tensor<4xi32>) -> tensor<32x12x12x64xf32> { // CHECK: %0 = mhlo.constant dense<[32, 12, 12, 64]> : tensor<4xi32> // CHECK-NEXT: %1 = mhlo.constant dense<[32, 12, 12, 64]> : tensor<4xi32> - // CHECK-NEXT: %2 = "mhlo.slice"(%1) {limit_indices = dense<1> : tensor<1xi64>, start_indices = dense<0> : tensor<1xi64>, strides = dense<1> : tensor<1xi64>} : (tensor<4xi32>) -> tensor<1xi32> + // CHECK-NEXT: %2 = "mhlo.slice"(%1) <{limit_indices = dense<1> : tensor<1xi64>, start_indices = dense<0> : tensor<1xi64>, strides = dense<1> : tensor<1xi64>}> : (tensor<4xi32>) -> tensor<1xi32> // CHECK-NEXT: %3 = mhlo.reshape %2 : (tensor<1xi32>) -> tensor // CHECK-NEXT: %4 = mhlo.convert %3 : tensor - // CHECK-NEXT: %5 = "mhlo.slice"(%1) {limit_indices = dense<2> : tensor<1xi64>, start_indices = dense<1> : tensor<1xi64>, strides = dense<1> : tensor<1xi64>} : (tensor<4xi32>) -> tensor<1xi32> + // CHECK-NEXT: %5 = "mhlo.slice"(%1) <{limit_indices = dense<2> : tensor<1xi64>, start_indices = dense<1> : tensor<1xi64>, strides = dense<1> : tensor<1xi64>}> : (tensor<4xi32>) -> tensor<1xi32> // CHECK-NEXT: %6 = mhlo.reshape %5 : (tensor<1xi32>) -> tensor // CHECK-NEXT: %7 = mhlo.convert %6 : tensor - // CHECK-NEXT: %8 = "mhlo.slice"(%1) {limit_indices = dense<3> : tensor<1xi64>, start_indices = dense<2> : tensor<1xi64>, strides = dense<1> : tensor<1xi64>} : (tensor<4xi32>) -> tensor<1xi32> + // CHECK-NEXT: %8 = "mhlo.slice"(%1) <{limit_indices = dense<3> : tensor<1xi64>, start_indices = dense<2> : tensor<1xi64>, strides = 
dense<1> : tensor<1xi64>}> : (tensor<4xi32>) -> tensor<1xi32> // CHECK-NEXT: %9 = mhlo.reshape %8 : (tensor<1xi32>) -> tensor // CHECK-NEXT: %10 = mhlo.convert %9 : tensor - // CHECK-NEXT: %11 = "mhlo.slice"(%1) {limit_indices = dense<4> : tensor<1xi64>, start_indices = dense<3> : tensor<1xi64>, strides = dense<1> : tensor<1xi64>} : (tensor<4xi32>) -> tensor<1xi32> + // CHECK-NEXT: %11 = "mhlo.slice"(%1) <{limit_indices = dense<4> : tensor<1xi64>, start_indices = dense<3> : tensor<1xi64>, strides = dense<1> : tensor<1xi64>}> : (tensor<4xi32>) -> tensor<1xi32> // CHECK-NEXT: %12 = mhlo.reshape %11 : (tensor<1xi32>) -> tensor // CHECK-NEXT: %13 = mhlo.convert %12 : tensor // CHECK-NEXT: %14 = mhlo.constant dense<0.000000e+00> : tensor // CHECK-NEXT: %15 = mhlo.constant dense<1.000000e+00> : tensor // CHECK-NEXT: %16 = mhlo.constant dense<[32, 12, 12, 64]> : tensor<4xi64> - // CHECK-NEXT: %17 = "mhlo.rng"(%14, %15, %16) {rng_distribution = #mhlo.rng_distribution} : (tensor, tensor, tensor<4xi64>) -> tensor<32x12x12x64xf32> + // CHECK-NEXT: %17 = "mhlo.rng"(%14, %15, %16) <{rng_distribution = #mhlo.rng_distribution}> : (tensor, tensor, tensor<4xi64>) -> tensor<32x12x12x64xf32> %cst = "tf.Const"() {value = dense<[32, 12, 12, 64]> : tensor<4xi32>} : () -> tensor<4xi32> %0 = "tf.RandomUniform"(%cst) {seed = 87654321 : i64, seed2 = 0 : i64} : (tensor<4xi32>) -> tensor<32x12x12x64xf32> // CHECK: return %17 : tensor<32x12x12x64xf32> @@ -103,7 +103,7 @@ func.func @slice_variable_start(%arg0: tensor<3x4xi32>, %arg1: tensor<2xi64>) -> // CHECK-DAG-SAME: start_indices = dense<1> : tensor<1xi64>, // CHECK-DAG-SAME: strides = dense<1> : tensor<1xi64>} : (tensor<2xi64>) -> tensor<1xi64> // CHECK: %[[RESHAPED_START2:.*]] = mhlo.reshape %[[SLICED_START2]] : (tensor<1xi64>) -> tensor - // CHECK: %[[RESULT:.*]] = "mhlo.dynamic_slice"(%arg0, %[[RESHAPED_START1]], %[[RESHAPED_START2]]) {slice_sizes = dense<[1, 4]> : tensor<2xi64>} : (tensor<3x4xi32>, tensor, tensor) -> tensor<1x4xi32> + 
// CHECK: %[[RESULT:.*]] = "mhlo.dynamic_slice"(%arg0, %[[RESHAPED_START1]], %[[RESHAPED_START2]]) <{slice_sizes = dense<[1, 4]> : tensor<2xi64>}> : (tensor<3x4xi32>, tensor, tensor) -> tensor<1x4xi32> // CHECK: return %[[RESULT]] : tensor<1x4xi32> %sizes = "tf.Const"() {value = dense<[1, 4]> : tensor<2xi64>} : () -> (tensor<2xi64>) %0 = "tf.Slice"(%arg0, %arg1, %sizes) : (tensor<3x4xi32>, tensor<2xi64>, tensor<2xi64>) -> tensor<1x4xi32> @@ -114,26 +114,26 @@ func.func @slice_variable_start(%arg0: tensor<3x4xi32>, %arg1: tensor<2xi64>) -> func.func @slice_variable_start_negsize(%arg0: tensor<3x4xi32>, %arg1: tensor<2xi32>) -> tensor<1x4xi32> { // CHECK: %0 = mhlo.constant dense<[1, -1]> : tensor<2xi32> // CHECK-NEXT: %1 = mhlo.constant dense<[1, -1]> : tensor<2xi32> - // CHECK-NEXT: %2 = "mhlo.slice"(%1) {limit_indices = dense<1> : tensor<1xi64>, start_indices = dense<0> : tensor<1xi64>, strides = dense<1> : tensor<1xi64>} : (tensor<2xi32>) -> tensor<1xi32> + // CHECK-NEXT: %2 = "mhlo.slice"(%1) <{limit_indices = dense<1> : tensor<1xi64>, start_indices = dense<0> : tensor<1xi64>, strides = dense<1> : tensor<1xi64>}> : (tensor<2xi32>) -> tensor<1xi32> // CHECK-NEXT: %3 = mhlo.reshape %2 : (tensor<1xi32>) -> tensor // CHECK-NEXT: %4 = mhlo.constant dense<[1, -1]> : tensor<2xi32> - // CHECK-NEXT: %5 = "mhlo.slice"(%4) {limit_indices = dense<2> : tensor<1xi64>, start_indices = dense<1> : tensor<1xi64>, strides = dense<1> : tensor<1xi64>} : (tensor<2xi32>) -> tensor<1xi32> + // CHECK-NEXT: %5 = "mhlo.slice"(%4) <{limit_indices = dense<2> : tensor<1xi64>, start_indices = dense<1> : tensor<1xi64>, strides = dense<1> : tensor<1xi64>}> : (tensor<2xi32>) -> tensor<1xi32> // CHECK-NEXT: %6 = mhlo.reshape %5 : (tensor<1xi32>) -> tensor // CHECK-NEXT: %7 = mhlo.constant dense<3> : tensor - // CHECK-NEXT: %8 = "mhlo.set_dimension_size"(%arg0, %7) {dimension = 0 : i64} : (tensor<3x4xi32>, tensor) -> tensor<3x4xi32> + // CHECK-NEXT: %8 = "mhlo.set_dimension_size"(%arg0, %7) 
<{dimension = 0 : i64}> : (tensor<3x4xi32>, tensor) -> tensor<3x4xi32> // CHECK-NEXT: %9 = mhlo.constant dense<4> : tensor - // CHECK-NEXT: %10 = "mhlo.set_dimension_size"(%8, %9) {dimension = 1 : i64} : (tensor<3x4xi32>, tensor) -> tensor<3x4xi32> + // CHECK-NEXT: %10 = "mhlo.set_dimension_size"(%8, %9) <{dimension = 1 : i64}> : (tensor<3x4xi32>, tensor) -> tensor<3x4xi32> // CHECK-NEXT: %11 = mhlo.constant dense<0> : tensor - // CHECK-NEXT: %12 = "mhlo.pad"(%10, %11) {edge_padding_high = dense<[3, 4]> : tensor<2xi64>, edge_padding_low = dense<0> : tensor<2xi64>, interior_padding = dense<0> : tensor<2xi64>} : (tensor<3x4xi32>, tensor) -> tensor<6x8xi32> - // CHECK-NEXT: %13 = "mhlo.slice"(%arg1) {limit_indices = dense<1> : tensor<1xi64>, start_indices = dense<0> : tensor<1xi64>, strides = dense<1> : tensor<1xi64>} : (tensor<2xi32>) -> tensor<1xi32> + // CHECK-NEXT: %12 = "mhlo.pad"(%10, %11) <{edge_padding_high = dense<[3, 4]> : tensor<2xi64>, edge_padding_low = dense<0> : tensor<2xi64>, interior_padding = dense<0> : tensor<2xi64>}> : (tensor<3x4xi32>, tensor) -> tensor<6x8xi32> + // CHECK-NEXT: %13 = "mhlo.slice"(%arg1) <{limit_indices = dense<1> : tensor<1xi64>, start_indices = dense<0> : tensor<1xi64>, strides = dense<1> : tensor<1xi64>}> : (tensor<2xi32>) -> tensor<1xi32> // CHECK-NEXT: %14 = mhlo.reshape %13 : (tensor<1xi32>) -> tensor - // CHECK-NEXT: %15 = "mhlo.slice"(%arg1) {limit_indices = dense<2> : tensor<1xi64>, start_indices = dense<1> : tensor<1xi64>, strides = dense<1> : tensor<1xi64>} : (tensor<2xi32>) -> tensor<1xi32> + // CHECK-NEXT: %15 = "mhlo.slice"(%arg1) <{limit_indices = dense<2> : tensor<1xi64>, start_indices = dense<1> : tensor<1xi64>, strides = dense<1> : tensor<1xi64>}> : (tensor<2xi32>) -> tensor<1xi32> // CHECK-NEXT: %16 = mhlo.reshape %15 : (tensor<1xi32>) -> tensor - // CHECK-NEXT: %17 = "mhlo.dynamic_slice"(%12, %14, %16) {slice_sizes = dense<[3, 4]> : tensor<2xi64>} : (tensor<6x8xi32>, tensor, tensor) -> tensor<3x4xi32> - // 
CHECK-NEXT: %18 = "mhlo.slice"(%17) {limit_indices = dense<[1, 4]> : tensor<2xi64>, start_indices = dense<0> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<3x4xi32>) -> tensor<1x4xi32> + // CHECK-NEXT: %17 = "mhlo.dynamic_slice"(%12, %14, %16) <{slice_sizes = dense<[3, 4]> : tensor<2xi64>}> : (tensor<6x8xi32>, tensor, tensor) -> tensor<3x4xi32> + // CHECK-NEXT: %18 = "mhlo.slice"(%17) <{limit_indices = dense<[1, 4]> : tensor<2xi64>, start_indices = dense<0> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>}> : (tensor<3x4xi32>) -> tensor<1x4xi32> // CHECK-NEXT: %19 = mhlo.constant dense<4> : tensor // CHECK-NEXT: %20 = mhlo.subtract %19, %16 : tensor - // CHECK-NEXT: %21 = "mhlo.set_dimension_size"(%18, %20) {dimension = 1 : i64} : (tensor<1x4xi32>, tensor) -> tensor<1x?xi32, #mhlo.type_extensions> + // CHECK-NEXT: %21 = "mhlo.set_dimension_size"(%18, %20) <{dimension = 1 : i64}> : (tensor<1x4xi32>, tensor) -> tensor<1x?xi32, #mhlo.type_extensions> // CHECK-NEXT: %cast = tensor.cast %21 : tensor<1x?xi32, #mhlo.type_extensions> to tensor<1x4xi32> // CHECK-NEXT: return %cast : tensor<1x4xi32> %sizes = "tf.Const"() {value = dense<[1, -1]> : tensor<2xi32>} : () -> (tensor<2xi32>) @@ -178,7 +178,7 @@ func.func @fused_conv2d(%input: tensor<1x300x300x40xi8>, // CHECK-NEXT: %[[v1:.*]] = mhlo.constant dense<2.000000e+00> : tensor // CHECK-NEXT: %[[v2:.*]] = mhlo.constant dense<2.000000e+00> : tensor // CHECK-NEXT: %[[v3:.*]] = mhlo.constant dense<-1.280000e+02> : tensor - // CHECK-NEXT: %[[v4:.*]] = "mhlo.broadcast_in_dim"(%3) {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor) -> tensor<1x300x300x40xf32> + // CHECK-NEXT: %[[v4:.*]] = "mhlo.broadcast_in_dim"(%3) <{broadcast_dimensions = dense<> : tensor<0xi64>}> : (tensor) -> tensor<1x300x300x40xf32> // CHECK-NEXT: %[[v5:.*]] = mhlo.convert %arg0 : (tensor<1x300x300x40xi8>) -> tensor<1x300x300x40xf32> // CHECK-NEXT: %[[v6:.*]] = mhlo.convert %arg1 : (tensor<3x3x40x40xi8>) -> tensor<3x3x40x40xf32> 
// CHECK: %[[v7:.*]] = mhlo.convolution(%[[v5]], %[[v6]]) @@ -188,16 +188,16 @@ func.func @fused_conv2d(%input: tensor<1x300x300x40xi8>, // CHECK-SAME: feature_group_count = 1 // CHECK-NEXT: %[[v8:.*]] = mhlo.convert %7 : tensor<1x300x300x40xf32> // CHECK-NEXT: %[[v9:.*]] = mhlo.constant dense<1.000000e+00> : tensor - // CHECK-NEXT: %[[v10:.*]] = "mhlo.broadcast_in_dim"(%9) {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor) -> tensor<1x300x300x40xf32> + // CHECK-NEXT: %[[v10:.*]] = "mhlo.broadcast_in_dim"(%9) <{broadcast_dimensions = dense<> : tensor<0xi64>}> : (tensor) -> tensor<1x300x300x40xf32> // CHECK-NEXT: %11 = mhlo.multiply %8, %10 : tensor<1x300x300x40xf32> // CHECK-NEXT: %12 = mhlo.convert %arg2 : tensor<40xf32> - // CHECK-NEXT: %13 = "mhlo.broadcast_in_dim"(%12) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<40xf32>) -> tensor<1x300x300x40xf32> + // CHECK-NEXT: %13 = "mhlo.broadcast_in_dim"(%12) <{broadcast_dimensions = dense<3> : tensor<1xi64>}> : (tensor<40xf32>) -> tensor<1x300x300x40xf32> // CHECK-NEXT: %14 = mhlo.add %11, %13 : tensor<1x300x300x40xf32> // CHECK-NEXT: %15 = mhlo.constant dense<0.000000e+00> : tensor - // CHECK-NEXT: %16 = "mhlo.broadcast_in_dim"(%15) {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor) -> tensor<1x300x300x40xf32> + // CHECK-NEXT: %16 = "mhlo.broadcast_in_dim"(%15) <{broadcast_dimensions = dense<> : tensor<0xi64>}> : (tensor) -> tensor<1x300x300x40xf32> // CHECK-NEXT: %17 = mhlo.maximum %14, %16 : tensor<1x300x300x40xf32> // CHECK-NEXT: %18 = mhlo.constant dense<1.270000e+02> : tensor - // CHECK-NEXT: %19 = "mhlo.broadcast_in_dim"(%18) {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor) -> tensor<1x300x300x40xf32> + // CHECK-NEXT: %19 = "mhlo.broadcast_in_dim"(%18) <{broadcast_dimensions = dense<> : tensor<0xi64>}> : (tensor) -> tensor<1x300x300x40xf32> // CHECK-NEXT: %20 = mhlo.clamp %4, %17, %19 : tensor<1x300x300x40xf32> // CHECK-NEXT: %21 = mhlo.round_nearest_even %20 : 
tensor<1x300x300x40xf32> // CHECK-NEXT: %22 = mhlo.convert %21 : (tensor<1x300x300x40xf32>) -> tensor<1x300x300x40xi8> diff --git a/tensorflow/compiler/mlir/tf2xla/tests/legalize-tf-quant.mlir b/tensorflow/compiler/mlir/tf2xla/tests/legalize-tf-quant.mlir index 6004400ffe8802..328a00ce59bbec 100644 --- a/tensorflow/compiler/mlir/tf2xla/tests/legalize-tf-quant.mlir +++ b/tensorflow/compiler/mlir/tf2xla/tests/legalize-tf-quant.mlir @@ -256,7 +256,7 @@ func.func @uniform_quantized_dot(%input: tensor) -> tensor : tensor } : () -> tensor // CHECK-DAG: %[[RHS:.*]] = mhlo.constant() - // CHECK-SAME{LITERAL}: {value = dense<[[1, 2], [3, 4]]> : tensor<2x2xi8>} : () -> tensor<2x2x!quant.uniform> + // CHECK-SAME{LITERAL}: <{value = dense<[[1, 2], [3, 4]]> : tensor<2x2xi8>}> : () -> tensor<2x2x!quant.uniform> // CHECK-DAG: %[[LHS:.*]] = mhlo.uniform_quantize %arg0 : (tensor) -> tensor> // CHECK-DAG: %[[CONVERT_1:.*]] = mhlo.bitcast_convert %[[LHS]] : (tensor>) -> tensor // CHECK-DAG: %[[CONVERT_2:.*]] = mhlo.bitcast_convert %[[CONVERT_1]] : (tensor) -> tensor> @@ -306,7 +306,7 @@ func.func @uniform_quantized_convolution(%input: tensor<1x6x6x3xf32>) -> tensor< %output_zps = "tf.Const"() { value = dense<5> : tensor } : () -> tensor // CHECK-DAG: %[[RHS:.*]] = mhlo.constant() - // CHECK-SAME{LITERAL}: {value = dense<127> : tensor<2x3x3x2xi8>} : () -> tensor<2x3x3x2x!quant.uniform> + // CHECK-SAME{LITERAL}: <{value = dense<127> : tensor<2x3x3x2xi8>}> : () -> tensor<2x3x3x2x!quant.uniform> // CHECK-DAG: %[[LHS:.*]] = mhlo.uniform_quantize %arg0 : (tensor<1x6x6x3xf32>) -> tensor<1x6x6x3x!quant.uniform> // CHECK-DAG: %[[CONVERT_1:.*]] = mhlo.bitcast_convert %[[LHS]] : (tensor<1x6x6x3x!quant.uniform>) -> tensor<1x6x6x3xi8> // CHECK-DAG: %[[CONVERT_2:.*]] = mhlo.bitcast_convert %[[CONVERT_1]] : (tensor<1x6x6x3xi8>) -> tensor<1x6x6x3x!quant.uniform> @@ -367,7 +367,7 @@ func.func @uniform_quantized_add(%arg0: tensor<3x2x!tf_type.qint32>) -> tensor<3 %output_zps = "tf.Const"() { value = 
dense<4> : tensor } : () -> tensor // CHECK-DAG: %[[LHS:.*]] = mhlo.bitcast_convert %arg0 : (tensor<3x2xi32>) -> tensor<3x2x!quant.uniform> - // CHECK-DAG: %[[RHS:.*]] = mhlo.constant() {value = dense<127> : tensor<2xi32>} : () -> tensor<2x!quant.uniform> + // CHECK-DAG: %[[RHS:.*]] = mhlo.constant() <{value = dense<127> : tensor<2xi32>}> : () -> tensor<2x!quant.uniform> // CHECK: %[[RES:.*]] = chlo.broadcast_add %[[LHS]], %[[RHS]] {broadcast_dimensions = array} : // CHECK-SAME: (tensor<3x2x!quant.uniform>, tensor<2x!quant.uniform>) // CHECK-SAME: -> tensor<3x2x!quant.uniform> @@ -407,7 +407,7 @@ func.func @uniform_quantized_clip_by_value(%input: tensor<3x2xf32>) -> tensor<3x %zps = "tf.Const"() { value = dense<4> : tensor<2xi32> } : () -> tensor<2xi32> // tensor_proto that points to dense<127> of type !tf_type.qint32. - // CHECK-DAG: %[[MIN_MAX:.*]] = mhlo.constant() {value = dense<127> : tensor<2xi32>} : () -> tensor<2x!quant.uniform> + // CHECK-DAG: %[[MIN_MAX:.*]] = mhlo.constant() <{value = dense<127> : tensor<2xi32>}> : () -> tensor<2x!quant.uniform> %min = "tf.Const"() { value = #tf_type : tensor<2x!tf_type.qint32> } : () -> tensor<2x!tf_type.qint32> %max = "tf.Const"() { value = #tf_type : tensor<2x!tf_type.qint32> } : () -> tensor<2x!tf_type.qint32> diff --git a/tensorflow/compiler/mlir/tf2xla/tests/legalize-tf-with-tf2xla-hlo-importer.mlir b/tensorflow/compiler/mlir/tf2xla/tests/legalize-tf-with-tf2xla-hlo-importer.mlir index aabc9d471f8385..f1fb2fec85722c 100644 --- a/tensorflow/compiler/mlir/tf2xla/tests/legalize-tf-with-tf2xla-hlo-importer.mlir +++ b/tensorflow/compiler/mlir/tf2xla/tests/legalize-tf-with-tf2xla-hlo-importer.mlir @@ -123,12 +123,12 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, pr // CHECK-LABEL: binary_op_broadcast func.func @binary_op_broadcast(%arg0: tensor<4x1xf32>, %arg1: tensor<4x1x4xf32>) -> tensor<4x4x4xf32> { - // CHECK: %[[BROADCAST0:.*]] = "mhlo.broadcast_in_dim"(%arg0) {broadcast_dimensions 
= dense<[1, 2]> : tensor<2xi64>} : (tensor<4x1xf32>) -> tensor<4x4x1xf32> + // CHECK: %[[BROADCAST0:.*]] = "mhlo.broadcast_in_dim"(%arg0) <{broadcast_dimensions = dense<[1, 2]> : tensor<2xi64>}> : (tensor<4x1xf32>) -> tensor<4x4x1xf32> // CHECK: %[[RESHAPE0:.*]] = mhlo.reshape %[[BROADCAST0]] : (tensor<4x4x1xf32>) -> tensor<4x4xf32> - // CHECK: %[[UPDATED_ARG0:.*]] = "mhlo.broadcast_in_dim"(%[[RESHAPE0]]) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<4x4xf32>) -> tensor<4x4x4xf32> + // CHECK: %[[UPDATED_ARG0:.*]] = "mhlo.broadcast_in_dim"(%[[RESHAPE0]]) <{broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>}> : (tensor<4x4xf32>) -> tensor<4x4x4xf32> // CHECK: %[[RESHAPE1:.*]] = mhlo.reshape %arg1 : (tensor<4x1x4xf32>) -> tensor<4x4xf32> - // CHECK: %[[UPDATED_ARG1:.*]] = "mhlo.broadcast_in_dim"(%[[RESHAPE1]]) {broadcast_dimensions = dense<[0, 2]> : tensor<2xi64>} : (tensor<4x4xf32>) -> tensor<4x4x4xf32> + // CHECK: %[[UPDATED_ARG1:.*]] = "mhlo.broadcast_in_dim"(%[[RESHAPE1]]) <{broadcast_dimensions = dense<[0, 2]> : tensor<2xi64>}> : (tensor<4x4xf32>) -> tensor<4x4x4xf32> // CHECK: %[[RESULT:.*]] = mhlo.atan2 %[[UPDATED_ARG0]], %[[UPDATED_ARG1]] : tensor<4x4x4xf32> // CHECK: return %[[RESULT]] : tensor<4x4x4xf32> @@ -228,18 +228,18 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, pr // CHECK-SAME: (%[[ARG0:.*]]: tensor<3x2xi32>, %[[ARG1:.*]]: tensor<3xf32>, %[[ARG2:.*]]: tensor) func.func @sparse_to_dense(%arg0: tensor<3x2xi32>, %arg1: tensor<3xf32>, %arg2: tensor) -> tensor<3x3xf32> { - // CHECK: %[[DEFAULT:.*]] = "mhlo.broadcast_in_dim"(%[[ARG2]]) {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor) -> tensor<3x3xf32> + // CHECK: %[[DEFAULT:.*]] = "mhlo.broadcast_in_dim"(%[[ARG2]]) <{broadcast_dimensions = dense<> : tensor<0xi64>}> : (tensor) -> tensor<3x3xf32> - // CHECK: %[[RESULT:.*]] = "mhlo.scatter"(%[[DEFAULT]], %[[ARG0]], %[[ARG1]]) ({ - // CHECK: ^bb0(%[[ARG3:.*]]: tensor, %[[ARG4:.*]]: tensor): - 
// CHECK: mhlo.return %[[ARG4]] : tensor - // CHECK: }) + // CHECK: %[[RESULT:.*]] = "mhlo.scatter"(%[[DEFAULT]], %[[ARG0]], %[[ARG1]]) // CHECK-SAME: indices_are_sorted = false // CHECK-SAME: scatter_dimension_numbers // CHECK-SAME: inserted_window_dims = [0, 1] // CHECK-SAME: scatter_dims_to_operand_dims = [0, 1] // CHECK-SAME: index_vector_dim = 1 // CHECK-SAME: unique_indices = false + // CHECK-NEXT: ^bb0(%[[ARG3:.*]]: tensor, %[[ARG4:.*]]: tensor): + // CHECK: mhlo.return %[[ARG4]] : tensor + // CHECK: }) // CHECK-SAME: (tensor<3x3xf32>, tensor<3x2xi32>, tensor<3xf32>) -> tensor<3x3xf32> // return %[[RESULT]] : tensor<3x3xf32> @@ -332,7 +332,7 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, pr func.func @set_dynamic_dimension_size(%input: tensor<4xf32>, %size: tensor) -> tensor { %dimension = "tf.Const"() { value = dense<0> : tensor } : () -> tensor // CHECK: mhlo.set_dimension_size - // CHECK-SAME: {dimension = 0 : i64} : (tensor<4xf32>, tensor) -> tensor> + // CHECK-SAME: <{dimension = 0 : i64}> : (tensor<4xf32>, tensor) -> tensor> %0 = "tf.XlaSetDynamicDimensionSize"(%input, %dimension, %size) : (tensor<4xf32>, tensor, tensor) -> tensor func.return %0 : tensor } @@ -469,7 +469,7 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, pr func.func @bounds_propagation(%input: tensor<4xf32>, %size: tensor) -> tensor { %dimension = "tf.Const"() { value = dense<0> : tensor } : () -> tensor // CHECK: %[[BOUNDED:.*]] = "mhlo.set_dimension_size" - // CHECK-SAME: {dimension = 0 : i64} : (tensor<4xf32>, tensor) -> tensor> + // CHECK-SAME: <{dimension = 0 : i64}> : (tensor<4xf32>, tensor) -> tensor> %0 = "tf.XlaSetDynamicDimensionSize"(%input, %dimension, %size) : (tensor<4xf32>, tensor, tensor) -> tensor %axis = "tf.Const"() { value = dense<0> : tensor<1xi32> } : () -> tensor<1xi32> @@ -487,7 +487,7 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, pr func.func 
@bounds_propagation_skip_symbol_ref_ops(%input: tensor<4xf32>, %size: tensor) -> tensor { %dimension = "tf.Const"() { value = dense<0> : tensor } : () -> tensor // CHECK: %[[BOUNDED:.*]] = "mhlo.set_dimension_size" - // CHECK-SAME: {dimension = 0 : i64} : (tensor<4xf32>, tensor) -> tensor> + // CHECK-SAME: <{dimension = 0 : i64}> : (tensor<4xf32>, tensor) -> tensor> %0 = "tf.XlaSetDynamicDimensionSize"(%input, %dimension, %size) : (tensor<4xf32>, tensor, tensor) -> tensor // CHECK: %[[ORIGINAL:.*]] = tensor.cast %[[BOUNDED]] : tensor> to tensor @@ -538,14 +538,14 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, pr // CHECK-LABEL: fusedBatchNormV3_noTraining func.func @fusedBatchNormV3_noTraining(%arg0: tensor<8x8x8x8xf32>, %arg1: tensor<8xf32>, %arg2: tensor<8xf32>, %arg3: tensor<8xf32>, %arg4: tensor<8xf32>) -> (tensor<8x8x8x8xf32>) { - // CHECK: "mhlo.batch_norm_inference"({{.*}}, %arg1, %arg2, %arg3, %arg4) {epsilon = 1.000000e-03 : f32, feature_index = 3 : i64} : (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>) -> tensor<8x8x8x8xf32> + // CHECK: "mhlo.batch_norm_inference"({{.*}}, %arg1, %arg2, %arg3, %arg4) <{epsilon = 1.000000e-03 : f32, feature_index = 3 : i64}> : (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>) -> tensor<8x8x8x8xf32> %0:6 = "tf.FusedBatchNormV3"(%arg0, %arg1, %arg2, %arg3, %arg4) {T = "tfdtype$DT_FLOAT", data_format = "NHWC", epsilon = 0.001 : f32, is_training = false} : (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>) -> (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>) func.return %0#0 : tensor<8x8x8x8xf32> } // CHECK-LABEL: fusedBatchNormV3_training func.func @fusedBatchNormV3_training(%arg0: tensor<8x8x8x8xf32>, %arg1: tensor<8xf32>, %arg2: tensor<8xf32>, %arg3: tensor<8xf32>, %arg4: tensor<8xf32>) -> (tensor<8x8x8x8xf32>) { - // CHECK: %[[OUT:.*]], 
%[[MEAN:.*]], %[[VAR:.*]] = "mhlo.batch_norm_training"({{.*}}, %arg1, %arg2) {epsilon = 1.000000e-03 : f32, feature_index = 3 : i64} : (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>) -> (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>) + // CHECK: %[[OUT:.*]], %[[MEAN:.*]], %[[VAR:.*]] = "mhlo.batch_norm_training"({{.*}}, %arg1, %arg2) <{epsilon = 1.000000e-03 : f32, feature_index = 3 : i64}> : (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>) -> (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>) %0:6 = "tf.FusedBatchNormV3"(%arg0, %arg1, %arg2, %arg3, %arg4) {T = "tfdtype$DT_FLOAT", data_format = "NHWC", epsilon = 0.001 : f32, exponential_avg_factor = 1.0 : f32, is_training = true} : (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>) -> (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>) func.return %0#0 : tensor<8x8x8x8xf32> } @@ -555,7 +555,7 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, pr // CHECK-NEXT: %[[grad:.*]] = mhlo.convert %arg0 : tensor<8x8x8x8xf32> // CHECK-NEXT: %[[act:.*]] = mhlo.convert %arg1 : tensor<8x8x8x8xf32> // CHECK: %[[scr1:.*]] = mhlo.rsqrt - // CHECK: %[[bcast_arg3:.+]] = "mhlo.dynamic_broadcast_in_dim"(%arg3, {{.*}}) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<8xf32>, tensor<4xindex>) -> tensor<8x8x8x8xf32> + // CHECK: %[[bcast_arg3:.+]] = "mhlo.dynamic_broadcast_in_dim"(%arg3, {{.*}}) <{broadcast_dimensions = dense<3> : tensor<1xi64>}> : (tensor<8xf32>, tensor<4xindex>) -> tensor<8x8x8x8xf32> // CHECK: %[[sub:.*]] = mhlo.subtract %[[act]], %[[bcast_arg3]] : tensor<8x8x8x8xf32> // CHECK: %[[mul:.*]] = mhlo.multiply %[[grad]], %[[sub]] : tensor<8x8x8x8xf32> // CHECK: mhlo.constant dense<[0, 1, 2]> : tensor<3xi64> @@ -566,7 +566,7 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, pr // CHECK: %[[scr2:.*]] = mhlo.convert %[[red1]] : tensor<8xf32> // CHECK: %[[mul2:.*]] = 
mhlo.multiply %arg2, %[[scr1]] : tensor<8xf32> - // CHECK: %[[bcast_mul2:.+]] = "mhlo.dynamic_broadcast_in_dim"(%[[mul2]], {{.*}}) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<8xf32>, tensor<4xindex>) -> tensor<8x8x8x8xf32> + // CHECK: %[[bcast_mul2:.+]] = "mhlo.dynamic_broadcast_in_dim"(%[[mul2]], {{.*}}) <{broadcast_dimensions = dense<3> : tensor<1xi64>}> : (tensor<8xf32>, tensor<4xindex>) -> tensor<8x8x8x8xf32> // CHECK: %[[mul3:.*]] = mhlo.multiply %[[grad]], %[[bcast_mul2]] : tensor<8x8x8x8xf32> // CHECK: %[[scale_backprop:.*]] = mhlo.multiply %[[scr1]], %[[scr2]] : tensor<8xf32> @@ -589,7 +589,7 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, pr func.func @fusedBatchNormGradV3_Training(%arg0: tensor<8x8x8x8xf32>, %arg1: tensor<8x8x8x8xf32>, %arg2: tensor<8xf32>, %arg3: tensor<8xf32>, %arg4: tensor<8xf32>, %arg5: tensor<8xf32>) -> (tensor<8x8x8x8xf32>, tensor<0xf32>, tensor<*xf32>) { // CHECK-NEXT: %[[grad:.*]] = mhlo.convert %arg0 : tensor<8x8x8x8xf32> // CHECK-NEXT: %[[act:.*]] = mhlo.convert %arg1 : tensor<8x8x8x8xf32> - // CHECK-NEXT: %[[grad_operand:.*]], %[[grad_scale:.*]], %[[grad_offset:.*]] = "mhlo.batch_norm_grad"(%[[act]], %arg2, %arg3, %arg4, %[[grad]]) {epsilon = 1.000000e-03 : f32, feature_index = 3 : i64} : (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8x8x8x8xf32>) -> (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>) + // CHECK-NEXT: %[[grad_operand:.*]], %[[grad_scale:.*]], %[[grad_offset:.*]] = "mhlo.batch_norm_grad"(%[[act]], %arg2, %arg3, %arg4, %[[grad]]) <{epsilon = 1.000000e-03 : f32, feature_index = 3 : i64}> : (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8x8x8x8xf32>) -> (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>) // CHECK-NEXT: %[[x_backprop:.*]] = mhlo.convert %[[grad_operand]] : tensor<8x8x8x8xf32> // CHECK: return %[[x_backprop]] // CHECK-SAME: tensor<8x8x8x8xf32> @@ -602,7 +602,9 @@ module attributes {tf.versions = 
{bad_consumers = [], min_consumer = 0 : i32, pr // CHECK-SAME: %[[INPUT:.*]]: tensor<10x24x24x64xf32>, %arg1: tensor<10x12x12x64xf32>, %[[GRAD:.*]]: tensor<10x12x12x64xf32> func.func @max_pool_grad_valid(%orig_input: tensor<10x24x24x64xf32>, %orig_output: tensor<10x12x12x64xf32>, %grad: tensor<10x12x12x64xf32>) -> tensor<10x24x24x64xf32> { // CHECK: %[[ZERO:.*]] = mhlo.constant dense<0.000000e+00> : tensor - // CHECK: %[[RESULT:.*]] = "mhlo.select_and_scatter"(%[[INPUT]], %[[GRAD]], %[[ZERO]]) ({ + // CHECK: %[[RESULT:.*]] = "mhlo.select_and_scatter"(%[[INPUT]], %[[GRAD]], %[[ZERO]]) <{ + // CHECK-SAME: padding = dense<0> : tensor<4x2xi64>, window_dimensions = dense<[1, 2, 2, 1]> : tensor<4xi64>, window_strides = dense<[1, 2, 2, 1]> : tensor<4xi64> + // CHECK-SAME }> ({ // CHECK: ^bb0(%[[VALUE_A:.*]]: tensor, %[[VALUE_B:.*]]: tensor): // CHECK: %[[SELECT_RESULT:.*]] = mhlo.compare GE, %[[VALUE_A]], %[[VALUE_B]] : (tensor, tensor) -> tensor // CHECK: mhlo.return %[[SELECT_RESULT]] : tensor @@ -610,7 +612,7 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, pr // CHECK: ^bb0(%[[VALUE_A:.*]]: tensor, %[[VALUE_B:.*]]: tensor): // CHECK: %[[SELECT_RESULT:.*]] = mhlo.add %[[VALUE_A]], %[[VALUE_B]] : tensor // CHECK: mhlo.return %[[SELECT_RESULT]] : tensor - // CHECK: }) {padding = dense<0> : tensor<4x2xi64>, window_dimensions = dense<[1, 2, 2, 1]> : tensor<4xi64>, window_strides = dense<[1, 2, 2, 1]> : tensor<4xi64>} : (tensor<10x24x24x64xf32>, tensor<10x12x12x64xf32>, tensor) -> tensor<10x24x24x64xf32> + // CHECK: }) : (tensor<10x24x24x64xf32>, tensor<10x12x12x64xf32>, tensor) -> tensor<10x24x24x64xf32> // CHECK: return %[[RESULT]] : tensor<10x24x24x64xf32> %result = "tf.MaxPoolGrad"(%orig_input, %orig_output, %grad) { data_format = "NHWC", @@ -661,7 +663,7 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, pr // CHECK-LABEL: func @concat_v2 func.func @concat_v2(%arg0: tensor<3x3xf32>, %arg1: tensor<3x3xf32>) 
-> tensor<6x3xf32> { - // CHECK: "mhlo.concatenate"({{.*}}) {dimension = 0 : i64} : (tensor<3x3xf32>, tensor<3x3xf32>) -> tensor<6x3xf32> + // CHECK: "mhlo.concatenate"({{.*}}) <{dimension = 0 : i64}> : (tensor<3x3xf32>, tensor<3x3xf32>) -> tensor<6x3xf32> %axis = "tf.Const"() { value = dense<0> : tensor } : () -> tensor %1 = "tf.ConcatV2"(%arg0, %arg1, %axis) : (tensor<3x3xf32>, tensor<3x3xf32>, tensor) -> tensor<6x3xf32> func.return %1 : tensor<6x3xf32> diff --git a/tensorflow/compiler/mlir/tf2xla/tests/legalize-tf.mlir b/tensorflow/compiler/mlir/tf2xla/tests/legalize-tf.mlir index ed288a27fa7383..bb9ca266fc7abc 100644 --- a/tensorflow/compiler/mlir/tf2xla/tests/legalize-tf.mlir +++ b/tensorflow/compiler/mlir/tf2xla/tests/legalize-tf.mlir @@ -18,7 +18,7 @@ // CHECK-LABEL: fusedBatchNormV2_noTraining func.func @fusedBatchNormV2_noTraining(%arg0: tensor<8x8x8x8xf32>, %arg1: tensor<8xf32>, %arg2: tensor<8xf32>, %arg3: tensor<8xf32>, %arg4: tensor<8xf32>) -> (tensor<8x8x8x8xf32>) { - // CHECK: "mhlo.batch_norm_inference"({{.*}}, %arg1, %arg2, %arg3, %arg4) {epsilon = 1.000000e-03 : f32, feature_index = 3 : i64} : (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>) -> tensor<8x8x8x8xf32> + // CHECK: "mhlo.batch_norm_inference"({{.*}}, %arg1, %arg2, %arg3, %arg4) <{epsilon = 1.000000e-03 : f32, feature_index = 3 : i64}> : (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>) -> tensor<8x8x8x8xf32> %0:5 = "tf.FusedBatchNormV2"(%arg0, %arg1, %arg2, %arg3, %arg4) {T = "tfdtype$DT_FLOAT", data_format = "NHWC", epsilon = 0.001 : f32, is_training = false} : (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>) -> (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>) func.return %0#0 : tensor<8x8x8x8xf32> } @@ -27,7 +27,7 @@ func.func @fusedBatchNormV2_noTraining(%arg0: tensor<8x8x8x8xf32>, %arg1: tensor // CHECK-LABEL: fusedBatchNormV2_training func.func 
@fusedBatchNormV2_training(%arg0: tensor<8x8x8x8xf32>, %arg1: tensor<8xf32>, %arg2: tensor<8xf32>, %arg3: tensor<8xf32>, %arg4: tensor<8xf32>) -> (tensor<8x8x8x8xf32>) { - // CHECK: %[[OUT:.*]], %[[MEAN:.*]], %[[VAR:.*]] = "mhlo.batch_norm_training"({{.*}}, %arg1, %arg2) {epsilon = 1.000000e-03 : f32, feature_index = 3 : i64} : (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>) -> (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>) + // CHECK: %[[OUT:.*]], %[[MEAN:.*]], %[[VAR:.*]] = "mhlo.batch_norm_training"({{.*}}, %arg1, %arg2) <{epsilon = 1.000000e-03 : f32, feature_index = 3 : i64}> : (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>) -> (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>) %0:5 = "tf.FusedBatchNormV2"(%arg0, %arg1, %arg2, %arg3, %arg4) {T = "tfdtype$DT_FLOAT", data_format = "NHWC", epsilon = 0.001 : f32, exponential_avg_factor = 1.0 : f32, is_training = true} : (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>) -> (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>) // CHECK: mhlo.constant // CHECK: chlo.broadcast_multiply %[[VAR]], {{.*}} : (tensor<8xf32>, tensor) -> tensor<8xf32> @@ -38,7 +38,7 @@ func.func @fusedBatchNormV2_training(%arg0: tensor<8x8x8x8xf32>, %arg1: tensor<8 // CHECK-LABEL: fusedBatchNormV3_noTraining func.func @fusedBatchNormV3_noTraining(%arg0: tensor<8x8x8x8xf32>, %arg1: tensor<8xf32>, %arg2: tensor<8xf32>, %arg3: tensor<8xf32>, %arg4: tensor<8xf32>) -> (tensor<8x8x8x8xf32>) { - // CHECK: "mhlo.batch_norm_inference"({{.*}}, %arg1, %arg2, %arg3, %arg4) {epsilon = 1.000000e-03 : f32, feature_index = 3 : i64} : (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>) -> tensor<8x8x8x8xf32> + // CHECK: "mhlo.batch_norm_inference"({{.*}}, %arg1, %arg2, %arg3, %arg4) <{epsilon = 1.000000e-03 : f32, feature_index = 3 : i64}> : (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>) -> tensor<8x8x8x8xf32> %0:6 = 
"tf.FusedBatchNormV3"(%arg0, %arg1, %arg2, %arg3, %arg4) {T = "tfdtype$DT_FLOAT", data_format = "NHWC", epsilon = 0.001 : f32, is_training = false} : (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>) -> (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>) func.return %0#0 : tensor<8x8x8x8xf32> } @@ -49,7 +49,7 @@ func.func @fusedBatchNormV3_noTraining(%arg0: tensor<8x8x8x8xf32>, %arg1: tensor // CHECK-SAME: ([[X:%.*]]: tensor<8x8x8x8xbf16>, [[SCALE:%.*]]: tensor<8xf32>, [[OFFSET:%.*]]: tensor<8xf32>, [[MEAN:%.*]]: tensor<8xf32>, [[VARIANCE:%.*]]: tensor<8xf32>) func.func @fusedBatchNormV3_noTraining_mixedPrecision(%arg0: tensor<8x8x8x8xbf16>, %arg1: tensor<8xf32>, %arg2: tensor<8xf32>, %arg3: tensor<8xf32>, %arg4: tensor<8xf32>) -> (tensor<8x8x8x8xbf16>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<*xf32>) { // CHECK: [[CONVERT_X:%.*]] = mhlo.convert [[X]] : (tensor<8x8x8x8xbf16>) -> tensor<8x8x8x8xf32> - // CHECK: [[Y:%.*]] = "mhlo.batch_norm_inference"([[CONVERT_X]], [[SCALE]], [[OFFSET]], [[MEAN]], [[VARIANCE]]) {epsilon = 1.000000e-03 : f32, feature_index = 3 : i64} + // CHECK: [[Y:%.*]] = "mhlo.batch_norm_inference"([[CONVERT_X]], [[SCALE]], [[OFFSET]], [[MEAN]], [[VARIANCE]]) <{epsilon = 1.000000e-03 : f32, feature_index = 3 : i64}> %0:6 = "tf.FusedBatchNormV3"(%arg0, %arg1, %arg2, %arg3, %arg4) {T = "tfdtype$DT_FLOAT", data_format = "NHWC", epsilon = 0.001 : f32, is_training = false} : (tensor<8x8x8x8xbf16>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>) -> (tensor<8x8x8x8xbf16>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<*xf32>) // CHECK: [[Y_CONVERT:%.*]] = mhlo.convert [[Y]] : (tensor<8x8x8x8xf32>) -> tensor<8x8x8x8xbf16> // CHECK: [[DUMMY:%.*]] = mhlo.constant dense<0.000000e+00> : tensor<0xf32> @@ -62,7 +62,7 @@ func.func @fusedBatchNormV3_noTraining_mixedPrecision(%arg0: tensor<8x8x8x8xbf16 // CHECK-LABEL: 
fusedBatchNormV3_training func.func @fusedBatchNormV3_training(%arg0: tensor<8x8x8x8xf32>, %arg1: tensor<8xf32>, %arg2: tensor<8xf32>, %arg3: tensor<8xf32>, %arg4: tensor<8xf32>) -> (tensor<8x8x8x8xf32>) { - // CHECK: %[[OUT:.*]], %[[MEAN:.*]], %[[VAR:.*]] = "mhlo.batch_norm_training"({{.*}}, %arg1, %arg2) {epsilon = 1.000000e-03 : f32, feature_index = 3 : i64} : (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>) -> (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>) + // CHECK: %[[OUT:.*]], %[[MEAN:.*]], %[[VAR:.*]] = "mhlo.batch_norm_training"({{.*}}, %arg1, %arg2) <{epsilon = 1.000000e-03 : f32, feature_index = 3 : i64}> : (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>) -> (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>) %0:6 = "tf.FusedBatchNormV3"(%arg0, %arg1, %arg2, %arg3, %arg4) {T = "tfdtype$DT_FLOAT", data_format = "NHWC", epsilon = 0.001 : f32, exponential_avg_factor = 1.0 : f32, is_training = true} : (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>) -> (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>) // CHECK: mhlo.constant // CHECK: chlo.broadcast_multiply %[[VAR]], {{.*}} : (tensor<8xf32>, tensor) -> tensor<8xf32> @@ -73,7 +73,7 @@ func.func @fusedBatchNormV3_training(%arg0: tensor<8x8x8x8xf32>, %arg1: tensor<8 // CHECK-LABEL: func @fusedBatchNormV3_training_batchVariance func.func @fusedBatchNormV3_training_batchVariance(%arg0: tensor<8x8x8x8xf32>, %arg1: tensor<8xf32>, %arg2: tensor<8xf32>, %arg3: tensor<8xf32>, %arg4: tensor<8xf32>) -> tensor<8xf32> { - // CHECK: %[[OUT:.*]], %[[MEAN:.*]], %[[VAR:.*]] = "mhlo.batch_norm_training"({{.*}}, %arg1, %arg2) {epsilon = 1.000000e-03 : f32, feature_index = 3 : i64} : (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>) -> (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>) + // CHECK: %[[OUT:.*]], %[[MEAN:.*]], %[[VAR:.*]] = "mhlo.batch_norm_training"({{.*}}, %arg1, %arg2) <{epsilon = 1.000000e-03 : f32, feature_index = 3 
: i64}> : (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>) -> (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>) %0:6 = "tf.FusedBatchNormV3"(%arg0, %arg1, %arg2, %arg3, %arg4) {T = "tfdtype$DT_FLOAT", data_format = "NHWC", epsilon = 0.001 : f32, exponential_avg_factor = 1.0 : f32, is_training = true} : (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>) -> (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>) // CHECK: return %[[VAR]] func.return %0#4 : tensor<8xf32> @@ -83,7 +83,7 @@ func.func @fusedBatchNormV3_training_batchVariance(%arg0: tensor<8x8x8x8xf32>, % // CHECK-LABEL: fusedBatchNormV3_training_exponentialAvgFactor func.func @fusedBatchNormV3_training_exponentialAvgFactor(%arg0: tensor<8x8x8x8xf32>, %arg1: tensor<8xf32>, %arg2: tensor<8xf32>, %arg3: tensor<8xf32>, %arg4: tensor<8xf32>) -> (tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>) { - // CHECK: %[[OUT:.*]], %[[MEAN:.*]], %[[VAR:.*]] = "mhlo.batch_norm_training"({{.*}}, %arg1, %arg2) {epsilon = 1.000000e-03 : f32, feature_index = 3 : i64} : (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>) -> (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>) + // CHECK: %[[OUT:.*]], %[[MEAN:.*]], %[[VAR:.*]] = "mhlo.batch_norm_training"({{.*}}, %arg1, %arg2) <{epsilon = 1.000000e-03 : f32, feature_index = 3 : i64}> : (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>) -> (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>) %0:6 = "tf.FusedBatchNormV3"(%arg0, %arg1, %arg2, %arg3, %arg4) {T = "tfdtype$DT_FLOAT", data_format = "NHWC", epsilon = 0.001 : f32, exponential_avg_factor = 0.8 : f32, is_training = true} : (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>) -> (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>) // CHECK: %[[FACTOR:.*]] = mhlo.constant dense<1.00195694> // CHECK: %[[CORRECTED_VAR:.*]] = chlo.broadcast_multiply %[[VAR]], 
%[[FACTOR]] @@ -117,7 +117,7 @@ func.func @fusedBatchNormV3_training_mixedPrecision(%arg0: tensor<8x8x8x8xbf16>, // CHECK-LABEL: fusedBatchNormV3_NCHW func.func @fusedBatchNormV3_NCHW(%arg0: tensor<8x8x8x8xf32>, %arg1: tensor<8xf32>, %arg2: tensor<8xf32>, %arg3: tensor<8xf32>, %arg4: tensor<8xf32>) -> (tensor<8x8x8x8xf32>) { - // CHECK: "mhlo.batch_norm_training"({{.*}}, %arg1, %arg2) {epsilon = 1.000000e-03 : f32, feature_index = 1 : i64} : (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>) -> (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>) + // CHECK: "mhlo.batch_norm_training"({{.*}}, %arg1, %arg2) <{epsilon = 1.000000e-03 : f32, feature_index = 1 : i64}> : (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>) -> (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>) %0:6 = "tf.FusedBatchNormV3"(%arg0, %arg1, %arg2, %arg3, %arg4) {T = "tfdtype$DT_FLOAT", data_format = "NCHW", epsilon = 0.001 : f32, exponential_avg_factor = 1.0 : f32, is_training = true} : (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>) -> (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>) func.return %0#0 : tensor<8x8x8x8xf32> } @@ -135,7 +135,7 @@ func.func @fusedBatchNormV3_NDHWC(%arg0: tensor<8x8x8x8x8xf32>, %arg1: tensor<8x // CHECK-LABEL: fusedBatchNormV3_noTraining_dynamic_supported func.func @fusedBatchNormV3_noTraining_dynamic_supported(%arg0: tensor, %arg1: tensor, %arg2: tensor, %arg3: tensor, %arg4: tensor) -> (tensor) { - // CHECK: "mhlo.batch_norm_inference"({{.*}}, %arg1, %arg2, %arg3, %arg4) {epsilon = 1.000000e-03 : f32, feature_index = 1 : i64} : (tensor, tensor, tensor, tensor, tensor) -> tensor + // CHECK: "mhlo.batch_norm_inference"({{.*}}, %arg1, %arg2, %arg3, %arg4) <{epsilon = 1.000000e-03 : f32, feature_index = 1 : i64}> : (tensor, tensor, tensor, tensor, tensor) -> tensor %0:6 = "tf.FusedBatchNormV3"(%arg0, %arg1, %arg2, %arg3, %arg4) {T = "tfdtype$DT_FLOAT", data_format = "NCHW", epsilon = 
0.001 : f32, exponential_avg_factor = 1.0 : f32, is_training = false} : (tensor, tensor, tensor, tensor, tensor) -> (tensor, tensor, tensor, tensor, tensor, tensor) func.return %0#0 : tensor } @@ -169,7 +169,7 @@ func.func @fusedBatchNormGrad_noTraining(%arg0: tensor<8x8x8x8xf32>, %arg1: tens // CHECK-NEXT: %[[add:.*]] = chlo.broadcast_add %arg4, %[[eps]] {broadcast_dimensions = array} : (tensor<8xf32>, tensor) -> tensor<8xf32> // CHECK-NEXT: %[[scr1:.*]] = mhlo.rsqrt %[[add]] : tensor<8xf32> - // CHECK: %[[bcast_arg3:.+]] = "mhlo.dynamic_broadcast_in_dim"(%arg3, {{.*}}) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<8xf32>, tensor<4xindex>) -> tensor<8x8x8x8xf32> + // CHECK: %[[bcast_arg3:.+]] = "mhlo.dynamic_broadcast_in_dim"(%arg3, {{.*}}) <{broadcast_dimensions = dense<3> : tensor<1xi64>}> : (tensor<8xf32>, tensor<4xindex>) -> tensor<8x8x8x8xf32> // CHECK-NEXT: %[[sub:.*]] = mhlo.subtract %[[act]], %[[bcast_arg3]] : tensor<8x8x8x8xf32> // CHECK-NEXT: %[[mul:.*]] = mhlo.multiply %[[grad]], %[[sub]] : tensor<8x8x8x8xf32> // CHECK-NEXT: mhlo.constant dense<[0, 1, 2]> : tensor<3xi64> @@ -179,7 +179,7 @@ func.func @fusedBatchNormGrad_noTraining(%arg0: tensor<8x8x8x8xf32>, %arg1: tens // CHECK-NEXT: %[[scr2:.*]] = mhlo.convert %[[red1]] : tensor<8xf32> // CHECK-NEXT: %[[mul2:.*]] = mhlo.multiply %arg2, %[[scr1]] : tensor<8xf32> - // CHECK: %[[bcast_mul2:.+]] = "mhlo.dynamic_broadcast_in_dim"(%[[mul2]], {{.*}}) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<8xf32>, tensor<4xindex>) -> tensor<8x8x8x8xf32> + // CHECK: %[[bcast_mul2:.+]] = "mhlo.dynamic_broadcast_in_dim"(%[[mul2]], {{.*}}) <{broadcast_dimensions = dense<3> : tensor<1xi64>}> : (tensor<8xf32>, tensor<4xindex>) -> tensor<8x8x8x8xf32> // CHECK-NEXT: %[[mul3:.*]] = mhlo.multiply %[[grad]], %[[bcast_mul2]] : tensor<8x8x8x8xf32> // CHECK-NEXT: %[[scale_backprop:.*]] = mhlo.multiply %[[scr1]], %[[scr2]] : tensor<8xf32> @@ -202,7 +202,7 @@ func.func @fusedBatchNormGrad_noTraining(%arg0: 
tensor<8x8x8x8xf32>, %arg1: tens func.func @fusedBatchNormGrad_Training(%arg0: tensor<8x8x8x8xf32>, %arg1: tensor<8x8x8x8xf32>, %arg2: tensor<8xf32>, %arg3: tensor<8xf32>, %arg4: tensor<8xf32>) -> (tensor<8x8x8x8xf32>) { // CHECK-NEXT: %[[grad:.*]] = mhlo.convert %arg0 : tensor<8x8x8x8xf32> // CHECK-NEXT: %[[act:.*]] = mhlo.convert %arg1 : tensor<8x8x8x8xf32> - // CHECK-NEXT: %[[grad_operand:.*]], %[[grad_scale:.*]], %[[grad_offset:.*]] = "mhlo.batch_norm_grad"(%[[act]], %arg2, %arg3, %arg4, %[[grad]]) {epsilon = 1.000000e-03 : f32, feature_index = 3 : i64} : (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8x8x8x8xf32>) -> (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>) + // CHECK-NEXT: %[[grad_operand:.*]], %[[grad_scale:.*]], %[[grad_offset:.*]] = "mhlo.batch_norm_grad"(%[[act]], %arg2, %arg3, %arg4, %[[grad]]) <{epsilon = 1.000000e-03 : f32, feature_index = 3 : i64}> : (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8x8x8x8xf32>) -> (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>) // CHECK-NEXT: %[[x_backprop:.*]] = mhlo.convert %[[grad_operand]] : tensor<8x8x8x8xf32> // CHECK-NEXT: return %[[x_backprop]] : tensor<8x8x8x8xf32> @@ -221,7 +221,7 @@ func.func @fusedBatchNormGradV2_noTraining(%arg0: tensor<8x8x8x8xf32>, %arg1: te // CHECK-NEXT: %[[add:.*]] = chlo.broadcast_add %arg4, %[[eps]] {broadcast_dimensions = array} : (tensor<8xf32>, tensor) -> tensor<8xf32> // CHECK-NEXT: %[[scr1:.*]] = mhlo.rsqrt %[[add]] : tensor<8xf32> - // CHECK: %[[bcast_arg3:.+]] = "mhlo.dynamic_broadcast_in_dim"(%arg3, {{.*}}) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<8xf32>, tensor<4xindex>) -> tensor<8x8x8x8xf32> + // CHECK: %[[bcast_arg3:.+]] = "mhlo.dynamic_broadcast_in_dim"(%arg3, {{.*}}) <{broadcast_dimensions = dense<3> : tensor<1xi64>}> : (tensor<8xf32>, tensor<4xindex>) -> tensor<8x8x8x8xf32> // CHECK-NEXT: %[[sub:.*]] = mhlo.subtract %[[act]], %[[bcast_arg3]] : tensor<8x8x8x8xf32> // CHECK-NEXT: 
%[[mul:.*]] = mhlo.multiply %[[grad]], %[[sub]] : tensor<8x8x8x8xf32> // CHECK-NEXT: mhlo.constant dense<[0, 1, 2]> : tensor<3xi64> @@ -231,7 +231,7 @@ func.func @fusedBatchNormGradV2_noTraining(%arg0: tensor<8x8x8x8xf32>, %arg1: te // CHECK-NEXT: %[[scr2:.*]] = mhlo.convert %[[red1]] : tensor<8xf32> // CHECK-NEXT: %[[mul2:.*]] = mhlo.multiply %arg2, %[[scr1]] : tensor<8xf32> - // CHECK: %[[bcast_mul2:.+]] = "mhlo.dynamic_broadcast_in_dim"(%[[mul2]], {{.*}}) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<8xf32>, tensor<4xindex>) -> tensor<8x8x8x8xf32> + // CHECK: %[[bcast_mul2:.+]] = "mhlo.dynamic_broadcast_in_dim"(%[[mul2]], {{.*}}) <{broadcast_dimensions = dense<3> : tensor<1xi64>}> : (tensor<8xf32>, tensor<4xindex>) -> tensor<8x8x8x8xf32> // CHECK-NEXT: %[[mul3:.*]] = mhlo.multiply %[[grad]], %[[bcast_mul2]] : tensor<8x8x8x8xf32> // CHECK-NEXT: %[[scale_backprop:.*]] = mhlo.multiply %[[scr1]], %[[scr2]] : tensor<8xf32> @@ -255,7 +255,7 @@ func.func @fusedBatchNormGradV2_noTraining(%arg0: tensor<8x8x8x8xf32>, %arg1: te func.func @fusedBatchNormGradV2_Training(%arg0: tensor<8x8x8x8xf32>, %arg1: tensor<8x8x8x8xf32>, %arg2: tensor<8xf32>, %arg3: tensor<8xf32>, %arg4: tensor<8xf32>) -> (tensor<8x8x8x8xf32>) { // CHECK-NEXT: %[[grad:.*]] = mhlo.convert %arg0 : tensor<8x8x8x8xf32> // CHECK-NEXT: %[[act:.*]] = mhlo.convert %arg1 : tensor<8x8x8x8xf32> - // CHECK-NEXT: %[[grad_operand:.*]], %[[grad_scale:.*]], %[[grad_offset:.*]] = "mhlo.batch_norm_grad"(%[[act]], %arg2, %arg3, %arg4, %[[grad]]) {epsilon = 1.000000e-03 : f32, feature_index = 3 : i64} : (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8x8x8x8xf32>) -> (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>) + // CHECK-NEXT: %[[grad_operand:.*]], %[[grad_scale:.*]], %[[grad_offset:.*]] = "mhlo.batch_norm_grad"(%[[act]], %arg2, %arg3, %arg4, %[[grad]]) <{epsilon = 1.000000e-03 : f32, feature_index = 3 : i64}> : (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, 
tensor<8xf32>, tensor<8x8x8x8xf32>) -> (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>) // CHECK-NEXT: %[[x_backprop:.*]] = mhlo.convert %[[grad_operand]] : tensor<8x8x8x8xf32> // CHECK-NEXT: return %[[x_backprop]] : tensor<8x8x8x8xf32> @@ -283,7 +283,7 @@ func.func @fusedBatchNormGradV2_noTraining_mixed_precision(%arg0: tensor<8x8x8x8 func.func @fusedBatchNormGradV2_Training_mixed_precision(%arg0: tensor<8x8x8x8xf32>, %arg1: tensor<8x8x8x8xbf16>, %arg2: tensor<8xf32>, %arg3: tensor<8xf32>, %arg4: tensor<8xf32>) -> (tensor<8x8x8x8xbf16>) { // CHECK-NEXT: %[[grad:.*]] = mhlo.convert %arg0 : tensor<8x8x8x8xf32> // CHECK-NEXT: %[[act:.*]] = mhlo.convert %arg1 : (tensor<8x8x8x8xbf16>) -> tensor<8x8x8x8xf32> - // CHECK-NEXT: %[[grad_operand:.*]], %[[grad_scale:.*]], %[[grad_offset:.*]] = "mhlo.batch_norm_grad"(%[[act]], %arg2, %arg3, %arg4, %[[grad]]) {epsilon = 1.000000e-03 : f32, feature_index = 3 : i64} : (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8x8x8x8xf32>) -> (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>) + // CHECK-NEXT: %[[grad_operand:.*]], %[[grad_scale:.*]], %[[grad_offset:.*]] = "mhlo.batch_norm_grad"(%[[act]], %arg2, %arg3, %arg4, %[[grad]]) <{epsilon = 1.000000e-03 : f32, feature_index = 3 : i64}> : (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8x8x8x8xf32>) -> (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>) // CHECK-NEXT: %[[x_backprop:.*]] = mhlo.convert %[[grad_operand]] : (tensor<8x8x8x8xf32>) -> tensor<8x8x8x8xbf16> // CHECK-NEXT: return %[[x_backprop]] : tensor<8x8x8x8xbf16> @@ -302,7 +302,7 @@ func.func @fusedBatchNormGradV3_noTraining(%arg0: tensor<8x8x8x8xf32>, %arg1: te // CHECK-NEXT: %[[add:.*]] = chlo.broadcast_add %arg4, %[[eps]] {broadcast_dimensions = array} : (tensor<8xf32>, tensor) -> tensor<8xf32> // CHECK-NEXT: %[[scr1:.*]] = mhlo.rsqrt %[[add]] : tensor<8xf32> - // CHECK: %[[bcast_arg3:.+]] = "mhlo.dynamic_broadcast_in_dim"(%arg3, {{.*}}) {broadcast_dimensions = 
dense<3> : tensor<1xi64>} : (tensor<8xf32>, tensor<4xindex>) -> tensor<8x8x8x8xf32> + // CHECK: %[[bcast_arg3:.+]] = "mhlo.dynamic_broadcast_in_dim"(%arg3, {{.*}}) <{broadcast_dimensions = dense<3> : tensor<1xi64>}> : (tensor<8xf32>, tensor<4xindex>) -> tensor<8x8x8x8xf32> // CHECK-NEXT: %[[sub:.*]] = mhlo.subtract %[[act]], %[[bcast_arg3]] : tensor<8x8x8x8xf32> // CHECK-NEXT: %[[mul:.*]] = mhlo.multiply %[[grad]], %[[sub]] : tensor<8x8x8x8xf32> // CHECK-NEXT: mhlo.constant dense<[0, 1, 2]> : tensor<3xi64> @@ -312,7 +312,7 @@ func.func @fusedBatchNormGradV3_noTraining(%arg0: tensor<8x8x8x8xf32>, %arg1: te // CHECK-NEXT: %[[scr2:.*]] = mhlo.convert %[[red1]] : tensor<8xf32> // CHECK-NEXT: %[[mul2:.*]] = mhlo.multiply %arg2, %[[scr1]] : tensor<8xf32> - // CHECK: %[[bcast_mul2:.+]] = "mhlo.dynamic_broadcast_in_dim"(%[[mul2]], {{.*}}) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<8xf32>, tensor<4xindex>) -> tensor<8x8x8x8xf32> + // CHECK: %[[bcast_mul2:.+]] = "mhlo.dynamic_broadcast_in_dim"(%[[mul2]], {{.*}}) <{broadcast_dimensions = dense<3> : tensor<1xi64>}> : (tensor<8xf32>, tensor<4xindex>) -> tensor<8x8x8x8xf32> // CHECK-NEXT: %[[mul3:.*]] = mhlo.multiply %[[grad]], %[[bcast_mul2]] : tensor<8x8x8x8xf32> // CHECK-NEXT: %[[scale_backprop:.*]] = mhlo.multiply %[[scr1]], %[[scr2]] : tensor<8xf32> @@ -336,7 +336,7 @@ func.func @fusedBatchNormGradV3_noTraining(%arg0: tensor<8x8x8x8xf32>, %arg1: te func.func @fusedBatchNormGradV3_Training(%arg0: tensor<8x8x8x8xf32>, %arg1: tensor<8x8x8x8xf32>, %arg2: tensor<8xf32>, %arg3: tensor<8xf32>, %arg4: tensor<8xf32>, %arg5: tensor<8xf32>) -> (tensor<8x8x8x8xf32>, tensor<0xf32>, tensor<*xf32>) { // CHECK-NEXT: %[[grad:.*]] = mhlo.convert %arg0 : tensor<8x8x8x8xf32> // CHECK-NEXT: %[[act:.*]] = mhlo.convert %arg1 : tensor<8x8x8x8xf32> - // CHECK-NEXT: %[[grad_operand:.*]], %[[grad_scale:.*]], %[[grad_offset:.*]] = "mhlo.batch_norm_grad"(%[[act]], %arg2, %arg3, %arg4, %[[grad]]) {epsilon = 1.000000e-03 : f32, 
feature_index = 3 : i64} : (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8x8x8x8xf32>) -> (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>) + // CHECK-NEXT: %[[grad_operand:.*]], %[[grad_scale:.*]], %[[grad_offset:.*]] = "mhlo.batch_norm_grad"(%[[act]], %arg2, %arg3, %arg4, %[[grad]]) <{epsilon = 1.000000e-03 : f32, feature_index = 3 : i64}> : (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8x8x8x8xf32>) -> (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>) // CHECK-NEXT: %[[x_backprop:.*]] = mhlo.convert %[[grad_operand]] : tensor<8x8x8x8xf32> // CHECK: return %[[x_backprop]] // CHECK-SAME: tensor<8x8x8x8xf32> @@ -365,7 +365,7 @@ func.func @fusedBatchNormGradV3_noTraining_mixed_precision(%arg0: tensor<8x8x8x8 func.func @fusedBatchNormGradV3_Training_mixed_precision(%arg0: tensor<8x8x8x8xf32>, %arg1: tensor<8x8x8x8xbf16>, %arg2: tensor<8xf32>, %arg3: tensor<8xf32>, %arg4: tensor<8xf32>, %arg5: tensor<8xf32>) -> (tensor<8x8x8x8xbf16>) { // CHECK-NEXT: %[[grad:.*]] = mhlo.convert %arg0 : tensor<8x8x8x8xf32> // CHECK-NEXT: %[[act:.*]] = mhlo.convert %arg1 : (tensor<8x8x8x8xbf16>) -> tensor<8x8x8x8xf32> - // CHECK-NEXT: %[[grad_operand:.*]], %[[grad_scale:.*]], %[[grad_offset:.*]] = "mhlo.batch_norm_grad"(%[[act]], %arg2, %arg3, %arg4, %[[grad]]) {epsilon = 1.000000e-03 : f32, feature_index = 3 : i64} : (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8x8x8x8xf32>) -> (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>) + // CHECK-NEXT: %[[grad_operand:.*]], %[[grad_scale:.*]], %[[grad_offset:.*]] = "mhlo.batch_norm_grad"(%[[act]], %arg2, %arg3, %arg4, %[[grad]]) <{epsilon = 1.000000e-03 : f32, feature_index = 3 : i64}> : (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8x8x8x8xf32>) -> (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>) // CHECK-NEXT: %[[x_backprop:.*]] = mhlo.convert %[[grad_operand]] : (tensor<8x8x8x8xf32>) -> tensor<8x8x8x8xbf16> // CHECK-NEXT: 
return %[[x_backprop]] : tensor<8x8x8x8xbf16> @@ -384,7 +384,7 @@ func.func @fusedBatchNormGradV3_noTraining_NCHW(%arg0: tensor<8x8x8x8xf32>, %arg // CHECK-NEXT: %[[add:.*]] = chlo.broadcast_add %arg4, %[[eps]] {broadcast_dimensions = array} : (tensor<8xf32>, tensor) -> tensor<8xf32> // CHECK-NEXT: %[[scr1:.*]] = mhlo.rsqrt %[[add]] : tensor<8xf32> - // CHECK: %[[bcast_arg3:.+]] = "mhlo.dynamic_broadcast_in_dim"(%arg3, {{.*}}) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<8xf32>, tensor<4xindex>) -> tensor<8x8x8x8xf32> + // CHECK: %[[bcast_arg3:.+]] = "mhlo.dynamic_broadcast_in_dim"(%arg3, {{.*}}) <{broadcast_dimensions = dense<1> : tensor<1xi64>}> : (tensor<8xf32>, tensor<4xindex>) -> tensor<8x8x8x8xf32> // CHECK-NEXT: %[[sub:.*]] = mhlo.subtract %[[act]], %[[bcast_arg3]] : tensor<8x8x8x8xf32> // CHECK-NEXT: %[[mul:.*]] = mhlo.multiply %[[grad]], %[[sub]] : tensor<8x8x8x8xf32> // CHECK-NEXT: mhlo.constant dense<[0, 2, 3]> : tensor<3xi64> @@ -394,7 +394,7 @@ func.func @fusedBatchNormGradV3_noTraining_NCHW(%arg0: tensor<8x8x8x8xf32>, %arg // CHECK-NEXT: %[[scr2:.*]] = mhlo.convert %[[red1]] : tensor<8xf32> // CHECK-NEXT: %[[mul2:.*]] = mhlo.multiply %arg2, %[[scr1]] : tensor<8xf32> - // CHECK: %[[bcast_mul2:.+]] = "mhlo.dynamic_broadcast_in_dim"(%[[mul2]], {{.*}}) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<8xf32>, tensor<4xindex>) -> tensor<8x8x8x8xf32> + // CHECK: %[[bcast_mul2:.+]] = "mhlo.dynamic_broadcast_in_dim"(%[[mul2]], {{.*}}) <{broadcast_dimensions = dense<1> : tensor<1xi64>}> : (tensor<8xf32>, tensor<4xindex>) -> tensor<8x8x8x8xf32> // CHECK-NEXT: %[[mul3:.*]] = mhlo.multiply %[[grad]], %[[bcast_mul2]] : tensor<8x8x8x8xf32> // CHECK-NEXT: %[[scale_backprop:.*]] = mhlo.multiply %[[scr1]], %[[scr2]] : tensor<8xf32> @@ -416,7 +416,7 @@ func.func @fusedBatchNormGradV3_noTraining_NCHW(%arg0: tensor<8x8x8x8xf32>, %arg // CHECK-LABEL: fusedBatchNormGradV3_Training_NCHW func.func @fusedBatchNormGradV3_Training_NCHW(%arg0: 
tensor<8x8x8x8xf32>, %arg1: tensor<8x8x8x8xf32>, %arg2: tensor<8xf32>, %arg3: tensor<8xf32>, %arg4: tensor<8xf32>, %arg5: tensor<8xf32>) -> (tensor<8x8x8x8xf32>) { - // CHECK: %{{.*}} = "mhlo.batch_norm_grad"(%{{.*}}, %arg2, %arg3, %arg4, %[[grad]]) {epsilon = 1.000000e-03 : f32, feature_index = 1 : i64} : (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8x8x8x8xf32>) -> (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>) + // CHECK: %{{.*}} = "mhlo.batch_norm_grad"(%{{.*}}, %arg2, %arg3, %arg4, %[[grad]]) <{epsilon = 1.000000e-03 : f32, feature_index = 1 : i64}> : (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8x8x8x8xf32>) -> (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>) %0:5 = "tf.FusedBatchNormGradV3"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5) {T = "tfdtype$DT_FLOAT", data_format = "NCHW", epsilon = 0.001 : f32, is_training = true} : (tensor<8x8x8x8xf32>, tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>) -> (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>) func.return %0#0 : tensor<8x8x8x8xf32> } @@ -524,8 +524,8 @@ func.func @clip_dynamic(%arg0 : tensor, %arg1 : tensor, %arg2 : te // CHECK-LABEL: @clip_static_broadcast func.func @clip_static_broadcast(%arg0 : tensor<5xf32>, %arg1 : tensor, %arg2 : tensor) -> tensor<5xf32> { // CHECK-DAG: [[SHPIDX:%.+]] = mhlo.constant dense<5> - // CHECK-DAG: [[BROADCAST_MIN:%.+]] = "mhlo.dynamic_broadcast_in_dim"(%arg1, [[SHPIDX]]) {broadcast_dimensions = dense<> : tensor<0xi64>} - // CHECK-DAG: [[BROADCAST_MAX:%.+]] = "mhlo.dynamic_broadcast_in_dim"(%arg2, [[SHPIDX]]) {broadcast_dimensions = dense<> : tensor<0xi64>} + // CHECK-DAG: [[BROADCAST_MIN:%.+]] = "mhlo.dynamic_broadcast_in_dim"(%arg1, [[SHPIDX]]) <{broadcast_dimensions = dense<> : tensor<0xi64>}> + // CHECK-DAG: [[BROADCAST_MAX:%.+]] = "mhlo.dynamic_broadcast_in_dim"(%arg2, [[SHPIDX]]) <{broadcast_dimensions = dense<> : tensor<0xi64>}> // 
CHECK-DAG: [[CLAMP:%.+]] = mhlo.clamp [[BROADCAST_MIN]], %arg0, [[BROADCAST_MAX]] %0 = "tf.ClipByValue"(%arg0, %arg1, %arg2) : (tensor<5xf32>, tensor, tensor) -> tensor<5xf32> @@ -538,8 +538,8 @@ func.func @clip_static_broadcast(%arg0 : tensor<5xf32>, %arg1 : tensor, %ar func.func @clip_dynamic_broadcast(%arg0 : tensor, %arg1 : tensor, %arg2 : tensor) -> tensor { // CHECK: [[SHP:%.+]] = shape.shape_of %arg0 // CHECK: [[SHPIDX:%.+]] = arith.index_cast [[SHP]] : tensor<1xindex> to tensor<1xi32> - // CHECK-DAG: [[BROADCAST_MIN:%.+]] = "mhlo.dynamic_broadcast_in_dim"(%arg1, [[SHPIDX]]) {broadcast_dimensions = dense<> : tensor<0xi64>} - // CHECK-DAG: [[BROADCAST_MAX:%.+]] = "mhlo.dynamic_broadcast_in_dim"(%arg2, [[SHPIDX]]) {broadcast_dimensions = dense<> : tensor<0xi64>} + // CHECK-DAG: [[BROADCAST_MIN:%.+]] = "mhlo.dynamic_broadcast_in_dim"(%arg1, [[SHPIDX]]) <{broadcast_dimensions = dense<> : tensor<0xi64>}> + // CHECK-DAG: [[BROADCAST_MAX:%.+]] = "mhlo.dynamic_broadcast_in_dim"(%arg2, [[SHPIDX]]) <{broadcast_dimensions = dense<> : tensor<0xi64>}> // CHECK-DAG: [[CLAMP:%.+]] = mhlo.clamp [[BROADCAST_MIN]], %arg0, [[BROADCAST_MAX]] %0 = "tf.ClipByValue"(%arg0, %arg1, %arg2) : (tensor, tensor, tensor) -> tensor @@ -557,11 +557,11 @@ func.func @clip_dynamic_broadcast(%arg0 : tensor, %arg1 : tensor, %a // CHECK-SAME: %[[ARG:.*]]: tensor<4x3x4x3xf32> func.func @diag_part(%arg0: tensor<4x3x4x3xf32>) -> tensor<4x3xf32> { // CHECK: %[[RS:.*]] = mhlo.reshape %[[ARG]] : (tensor<4x3x4x3xf32>) -> tensor<12x12xf32> - // CHECK-DAG: %[[IOTA0:.*]] = "mhlo.iota"() {iota_dimension = 0 : i64} : () -> tensor<12x12xi32> - // CHECK-DAG: %[[IOTA1:.*]] = "mhlo.iota"() {iota_dimension = 1 : i64} : () -> tensor<12x12xi32> + // CHECK-DAG: %[[IOTA0:.*]] = "mhlo.iota"() <{iota_dimension = 0 : i64}> : () -> tensor<12x12xi32> + // CHECK-DAG: %[[IOTA1:.*]] = "mhlo.iota"() <{iota_dimension = 1 : i64}> : () -> tensor<12x12xi32> // CHECK-DAG: %[[COMP:.*]] = mhlo.compare EQ, %[[IOTA0]], %[[IOTA1]], 
NOTYPE : (tensor<12x12xi32>, tensor<12x12xi32>) -> tensor<12x12xi1> // CHECK-DAG: %[[ZERO:.*]] = mhlo.constant dense<0.000000e+00> : tensor - // CHECK-DAG: %[[ZERO_MAT:.*]] = "mhlo.broadcast"(%[[ZERO]]) {broadcast_sizes = dense<12> : tensor<2xi64>} : (tensor) -> tensor<12x12xf32> + // CHECK-DAG: %[[ZERO_MAT:.*]] = "mhlo.broadcast"(%[[ZERO]]) <{broadcast_sizes = dense<12> : tensor<2xi64>}> : (tensor) -> tensor<12x12xf32> // CHECK-DAG: %[[SEL:.*]] = mhlo.select %[[COMP]], %[[RS]], %[[ZERO_MAT]] : tensor<12x12xi1>, tensor<12x12xf32> // CHECK-DAG: %[[RED:.*]] = mhlo.reduce(%[[SEL]] init: %[[ZERO]]) applies mhlo.add across dimensions = [0] : (tensor<12x12xf32>, tensor) -> tensor<12xf32> // CHECK-DAG: %[[RES:.*]] = mhlo.reshape %[[RED]] : (tensor<12xf32>) -> tensor<4x3xf32> @@ -581,22 +581,22 @@ func.func @diag_part(%arg0: tensor<4x3x4x3xf32>) -> tensor<4x3xf32> { func.func @matrix_diag_part(%arg0: tensor<7x140x128xi32>) -> tensor<7x22x128xi32> { // CHECK-DAG: %[[V0:.*]] = mhlo.constant dense<42> : tensor // CHECK-DAG: %[[V1:.*]] = mhlo.constant dense<[-10, 11]> : tensor<2xi32> - // CHECK-DAG: %[[V2:.*]] = "mhlo.iota"() {iota_dimension = 1 : i64} : () -> tensor<1x22x128xi32> - // CHECK-DAG: %[[V3:.*]] = "mhlo.iota"() {iota_dimension = 2 : i64} : () -> tensor<1x22x128xi32> + // CHECK-DAG: %[[V2:.*]] = "mhlo.iota"() <{iota_dimension = 1 : i64}> : () -> tensor<1x22x128xi32> + // CHECK-DAG: %[[V3:.*]] = "mhlo.iota"() <{iota_dimension = 2 : i64}> : () -> tensor<1x22x128xi32> // CHECK-DAG: %[[V4:.*]] = mhlo.constant dense<0> : tensor - // CHECK-DAG: %[[V5:.*]] = "mhlo.broadcast"(%[[V4]]) {broadcast_sizes = dense<[1, 22, 128]> : tensor<3xi64>} : (tensor) -> tensor<1x22x128xi32> + // CHECK-DAG: %[[V5:.*]] = "mhlo.broadcast"(%[[V4]]) <{broadcast_sizes = dense<[1, 22, 128]> : tensor<3xi64>}> : (tensor) -> tensor<1x22x128xi32> // CHECK-DAG: %[[V6:.*]] = mhlo.constant dense : tensor - // CHECK-DAG: %[[V7:.*]] = "mhlo.broadcast"(%[[V6]]) {broadcast_sizes = dense<[1, 22, 128]> : 
tensor<3xi64>} : (tensor) -> tensor<1x22x128xi1> + // CHECK-DAG: %[[V7:.*]] = "mhlo.broadcast"(%[[V6]]) <{broadcast_sizes = dense<[1, 22, 128]> : tensor<3xi64>}> : (tensor) -> tensor<1x22x128xi1> // CHECK-DAG: %[[V8:.*]] = mhlo.constant dense : tensor - // CHECK-DAG: %[[V9:.*]] = "mhlo.broadcast"(%[[V8]]) {broadcast_sizes = dense<[1, 22, 128]> : tensor<3xi64>} : (tensor) -> tensor<1x22x128xi1> + // CHECK-DAG: %[[V9:.*]] = "mhlo.broadcast"(%[[V8]]) <{broadcast_sizes = dense<[1, 22, 128]> : tensor<3xi64>}> : (tensor) -> tensor<1x22x128xi1> // CHECK-DAG: %[[V10:.*]] = mhlo.constant dense<11> : tensor - // CHECK-DAG: %[[V11:.*]] = "mhlo.broadcast"(%[[V10]]) {broadcast_sizes = dense<[1, 22, 128]> : tensor<3xi64>} : (tensor) -> tensor<1x22x128xi32> + // CHECK-DAG: %[[V11:.*]] = "mhlo.broadcast"(%[[V10]]) <{broadcast_sizes = dense<[1, 22, 128]> : tensor<3xi64>}> : (tensor) -> tensor<1x22x128xi32> // CHECK-DAG: %[[V12:.*]] = mhlo.constant dense<140> : tensor - // CHECK-DAG: %[[V13:.*]] = "mhlo.broadcast"(%[[V12]]) {broadcast_sizes = dense<[1, 22, 128]> : tensor<3xi64>} : (tensor) -> tensor<1x22x128xi32> + // CHECK-DAG: %[[V13:.*]] = "mhlo.broadcast"(%[[V12]]) <{broadcast_sizes = dense<[1, 22, 128]> : tensor<3xi64>}> : (tensor) -> tensor<1x22x128xi32> // CHECK-DAG: %[[V14:.*]] = mhlo.constant dense<128> : tensor - // CHECK-DAG: %[[V15:.*]] = "mhlo.broadcast"(%[[V14]]) {broadcast_sizes = dense<[1, 22, 128]> : tensor<3xi64>} : (tensor) -> tensor<1x22x128xi32> + // CHECK-DAG: %[[V15:.*]] = "mhlo.broadcast"(%[[V14]]) <{broadcast_sizes = dense<[1, 22, 128]> : tensor<3xi64>}> : (tensor) -> tensor<1x22x128xi32> // CHECK-DAG: %[[V16:.*]] = mhlo.constant dense<128> : tensor - // CHECK-DAG: %[[V17:.*]] = "mhlo.broadcast"(%[[V16]]) {broadcast_sizes = dense<[1, 22, 128]> : tensor<3xi64>} : (tensor) -> tensor<1x22x128xi32> + // CHECK-DAG: %[[V17:.*]] = "mhlo.broadcast"(%[[V16]]) <{broadcast_sizes = dense<[1, 22, 128]> : tensor<3xi64>}> : (tensor) -> tensor<1x22x128xi32> // CHECK-DAG: 
%[[V18:.*]] = mhlo.subtract %[[V11]], %[[V2]] : tensor<1x22x128xi32> // CHECK-DAG: %[[V19:.*]] = mhlo.negate %[[V18]] : tensor<1x22x128xi32> // CHECK-DAG: %[[V20:.*]] = mhlo.minimum %[[V18]], %[[V5]] : tensor<1x22x128xi32> @@ -621,10 +621,10 @@ func.func @matrix_diag_part(%arg0: tensor<7x140x128xi32>) -> tensor<7x22x128xi32 // CHECK-DAG: %[[V39:.*]] = mhlo.and %[[V37]], %[[V38]] : tensor<1x22x128xi1> // CHECK-DAG: %[[V40:.*]] = mhlo.and %[[V36]], %[[V39]] : tensor<1x22x128xi1> // CHECK-DAG: %[[V41:.*]] = mhlo.reshape %[[V40]] : (tensor<1x22x128xi1>) -> tensor<22x128xi1> - // CHECK-DAG: %[[V42:.*]] = "mhlo.concatenate"(%[[V33]], %[[V32]]) {dimension = 0 : i64} : (tensor<1x22x128xi32>, tensor<1x22x128xi32>) -> tensor<2x22x128xi32> - // CHECK-DAG: %[[V43:.*]] = "mhlo.gather"(%[[ARG]], %[[V42]]) {dimension_numbers = #mhlo.gather, indices_are_sorted = false, slice_sizes = dense<[7, 1, 1]> : tensor<3xi64>} : (tensor<7x140x128xi32>, tensor<2x22x128xi32>) -> tensor<7x22x128xi32> - // CHECK-DAG: %[[V44:.*]] = "mhlo.broadcast"(%[[V41]]) {broadcast_sizes = dense<7> : tensor<1xi64>} : (tensor<22x128xi1>) -> tensor<7x22x128xi1> - // CHECK-DAG: %[[V45:.*]] = "mhlo.broadcast"(%[[V0]]) {broadcast_sizes = dense<[7, 22, 128]> : tensor<3xi64>} : (tensor) -> tensor<7x22x128xi32> + // CHECK-DAG: %[[V42:.*]] = "mhlo.concatenate"(%[[V33]], %[[V32]]) <{dimension = 0 : i64}> : (tensor<1x22x128xi32>, tensor<1x22x128xi32>) -> tensor<2x22x128xi32> + // CHECK-DAG: %[[V43:.*]] = "mhlo.gather"(%[[ARG]], %[[V42]]) <{dimension_numbers = #mhlo.gather, indices_are_sorted = false, slice_sizes = dense<[7, 1, 1]> : tensor<3xi64>}> : (tensor<7x140x128xi32>, tensor<2x22x128xi32>) -> tensor<7x22x128xi32> + // CHECK-DAG: %[[V44:.*]] = "mhlo.broadcast"(%[[V41]]) <{broadcast_sizes = dense<7> : tensor<1xi64>}> : (tensor<22x128xi1>) -> tensor<7x22x128xi1> + // CHECK-DAG: %[[V45:.*]] = "mhlo.broadcast"(%[[V0]]) <{broadcast_sizes = dense<[7, 22, 128]> : tensor<3xi64>}> : (tensor) -> tensor<7x22x128xi32> // 
CHECK: %[[V46:.*]] = mhlo.select %[[V44]], %[[V43]], %[[V45]] : tensor<7x22x128xi1>, tensor<7x22x128xi32> // CHECK: return %[[V46]] : tensor<7x22x128xi32> %0 = mhlo.constant dense<42> : tensor // padding value @@ -670,7 +670,7 @@ func.func @matrix_diag_part_align_ll(%arg0: tensor<7x140x128xi32>) -> tensor<7x2 T = i32, align = "LEFT_LEFT" } : (tensor<7x140x128xi32>, tensor<2xi32>, tensor) -> tensor<7x22x128xi32> // CHECK: %[[false:.*]] = mhlo.constant dense : tensor - // CHECK: %[[b_false:.*]] = "mhlo.broadcast"(%[[false]]) {broadcast_sizes = dense<[1, 22, 128]> : tensor<3xi64>} : (tensor) -> tensor<1x22x128xi1> + // CHECK: %[[b_false:.*]] = "mhlo.broadcast"(%[[false]]) <{broadcast_sizes = dense<[1, 22, 128]> : tensor<3xi64>}> : (tensor) -> tensor<1x22x128xi1> // CHECK: %{{[0-9]*}} = mhlo.select %[[b_false]], %{{[0-9]*}}, %{{[0-9]*}} : tensor<1x22x128xi1>, tensor<1x22x128xi32> func.return %2: tensor<7x22x128xi32> } @@ -713,7 +713,7 @@ func.func @matrix_diag_part_align_rr(%arg0: tensor<7x140x128xi32>) -> tensor<7x2 T = i32, align = "RIGHT_RIGHT" } : (tensor<7x140x128xi32>, tensor<2xi32>, tensor) -> tensor<7x22x128xi32> // CHECK: %[[true:.*]] = mhlo.constant dense : tensor - // CHECK: %[[b_true:.*]] = "mhlo.broadcast"(%[[true]]) {broadcast_sizes = dense<[1, 22, 128]> : tensor<3xi64>} : (tensor) -> tensor<1x22x128xi1> + // CHECK: %[[b_true:.*]] = "mhlo.broadcast"(%[[true]]) <{broadcast_sizes = dense<[1, 22, 128]> : tensor<3xi64>}> : (tensor) -> tensor<1x22x128xi1> // CHECK: %{{[0-9]*}} = mhlo.select %[[b_true]], %{{[0-9]*}}, %{{[0-9]*}} : tensor<1x22x128xi1>, tensor<1x22x128xi32> func.return %2: tensor<7x22x128xi32> } @@ -1009,7 +1009,7 @@ func.func @floormod_dynamic_broadcast_denominator_(%arg0: tensor, %arg1 // CHECK-LABEL: @ones_like // CHECK-SAME: (%[[ARG:.*]]: tensor<2x?xf32>) func.func @ones_like(%arg0: tensor<2x?xf32>) -> tensor<2x?xf32> { - // CHECK: %[[RES:.*]] = "chlo.constant_like"(%[[ARG]]) {value = 1.0{{.*}}} + // CHECK: %[[RES:.*]] = 
"chlo.constant_like"(%[[ARG]]) <{value = 1.0{{.*}}}> // CHECK: return %[[RES]] %0 = "tf.OnesLike"(%arg0) : (tensor<2x?xf32>) -> tensor<2x?xf32> func.return %0 : tensor<2x?xf32> @@ -1024,7 +1024,7 @@ func.func @ones_like(%arg0: tensor<2x?xf32>) -> tensor<2x?xf32> { // CHECK-LABEL: @zeros_like // CHECK-SAME: (%[[ARG:.*]]: tensor<2x?xf32>) func.func @zeros_like(%arg0: tensor<2x?xf32>) -> tensor<2x?xf32> { - // CHECK: %[[RES:.*]] = "chlo.constant_like"(%[[ARG]]) {value = 0.0{{.*}}} + // CHECK: %[[RES:.*]] = "chlo.constant_like"(%[[ARG]]) <{value = 0.0{{.*}}}> // CHECK: return %[[RES]] %0 = "tf.ZerosLike"(%arg0) : (tensor<2x?xf32>) -> tensor<2x?xf32> func.return %0 : tensor<2x?xf32> @@ -1086,7 +1086,7 @@ func.func @real(%arg0: tensor<3xcomplex>) -> tensor<3xf32> { // CHECK-LABEL: func @concat_v2 func.func @concat_v2(%arg0: tensor<3x3xf32>, %arg1: tensor<3x3xf32>) -> tensor<6x3xf32> { - // CHECK: "mhlo.concatenate"({{.*}}) {dimension = 0 : i64} : (tensor<3x3xf32>, tensor<3x3xf32>) -> tensor<6x3xf32> + // CHECK: "mhlo.concatenate"({{.*}}) <{dimension = 0 : i64}> : (tensor<3x3xf32>, tensor<3x3xf32>) -> tensor<6x3xf32> %axis = "tf.Const"() { value = dense<0> : tensor } : () -> tensor %1 = "tf.ConcatV2"(%arg0, %arg1, %axis) : (tensor<3x3xf32>, tensor<3x3xf32>, tensor) -> tensor<6x3xf32> func.return %1 : tensor<6x3xf32> @@ -1096,7 +1096,7 @@ func.func @concat_v2(%arg0: tensor<3x3xf32>, %arg1: tensor<3x3xf32>) -> tensor<6 // CHECK-LABEL: func @concat_v2_neg_axis func.func @concat_v2_neg_axis(%arg0: tensor<3x3xf32>, %arg1: tensor<3x3xf32>) -> tensor<6x3xf32> { - // CHECK: "mhlo.concatenate"({{.*}}) {dimension = 0 : i64} : (tensor<3x3xf32>, tensor<3x3xf32>) -> tensor<6x3xf32> + // CHECK: "mhlo.concatenate"({{.*}}) <{dimension = 0 : i64}> : (tensor<3x3xf32>, tensor<3x3xf32>) -> tensor<6x3xf32> %axis = "tf.Const"() { value = dense<-2> : tensor } : () -> tensor %1 = "tf.ConcatV2"(%arg0, %arg1, %axis) : (tensor<3x3xf32>, tensor<3x3xf32>, tensor) -> tensor<6x3xf32> @@ -1107,7 +1107,7 
@@ func.func @concat_v2_neg_axis(%arg0: tensor<3x3xf32>, %arg1: tensor<3x3xf32>) -> // CHECK-LABEL: func @concat_v2_1d_axis func.func @concat_v2_1d_axis(%arg0: tensor<3x3xf32>, %arg1: tensor<3x3xf32>) -> tensor<3x6xf32> { - // CHECK: "mhlo.concatenate"({{.*}}) {dimension = 1 : i64} : (tensor<3x3xf32>, tensor<3x3xf32>) -> tensor<3x6xf32> + // CHECK: "mhlo.concatenate"({{.*}}) <{dimension = 1 : i64}> : (tensor<3x3xf32>, tensor<3x3xf32>) -> tensor<3x6xf32> %axis = "tf.Const"() { value = dense<[1]> : tensor<1xi64> } : () -> tensor<1xi64> %1 = "tf.ConcatV2"(%arg0, %arg1, %axis) : (tensor<3x3xf32>, tensor<3x3xf32>, tensor<1xi64>) -> tensor<3x6xf32> @@ -1132,7 +1132,7 @@ func.func @concat_v2_non_const_axis(%arg0: tensor<3x3xf32>, %arg1: tensor<3x3xf3 // CHECK-LABEL: func @padv2_1D func.func @padv2_1D(%arg0: tensor<3xf32>, %arg1: tensor) -> tensor<6xf32> { %padding = "tf.Const"() { value = dense<[[1, 2]]> : tensor<1x2xi64> } : () -> tensor<1x2xi64> - // CHECK: "mhlo.pad"(%arg0, %arg1) { + // CHECK: "mhlo.pad"(%arg0, %arg1) <{ // CHECK-SAME: edge_padding_high = dense<2> : tensor<1xi64>, // CHECK-SAME: edge_padding_low = dense<1> : tensor<1xi64>, // CHECK-SAME: interior_padding = dense<0> : tensor<1xi64> @@ -1145,7 +1145,7 @@ func.func @padv2_1D(%arg0: tensor<3xf32>, %arg1: tensor) -> tensor<6xf32> { // CHECK-LABEL: func @padv2_2D func.func @padv2_2D(%arg0: tensor<3x2xf32>, %arg1: tensor) -> tensor<6x9xf32> { %padding = "tf.Const"() { value = dense<[[1,2],[3,4]]> : tensor<2x2xi64> } : () -> tensor<2x2xi64> - // CHECK: "mhlo.pad"(%arg0, %arg1) { + // CHECK: "mhlo.pad"(%arg0, %arg1) <{ // CHECK-SAME: edge_padding_high = dense<[2, 4]> : tensor<2xi64>, // CHECK-SAME: edge_padding_low = dense<[1, 3]> : tensor<2xi64>, // CHECK-SAME: interior_padding = dense<0> : tensor<2xi64> @@ -1158,7 +1158,7 @@ func.func @padv2_2D(%arg0: tensor<3x2xf32>, %arg1: tensor) -> tensor<6x9xf3 // CHECK-LABEL: func @padv2_i32_paddings func.func @padv2_i32_paddings(%arg0: tensor<3x2xf32>, %arg1: tensor) 
-> tensor<6x9xf32> { %padding = "tf.Const"() { value = dense<[[1,2],[3,4]]> : tensor<2x2xi32> } : () -> tensor<2x2xi32> - // CHECK: "mhlo.pad"(%arg0, %arg1) { + // CHECK: "mhlo.pad"(%arg0, %arg1) <{ // CHECK-SAME: edge_padding_high = dense<[2, 4]> : tensor<2xi64>, // CHECK-SAME: edge_padding_low = dense<[1, 3]> : tensor<2xi64>, // CHECK-SAME: interior_padding = dense<0> : tensor<2xi64> @@ -1170,10 +1170,10 @@ func.func @padv2_i32_paddings(%arg0: tensor<3x2xf32>, %arg1: tensor) -> ten // CHECK-LABEL: func @padv2_dynamic func.func @padv2_dynamic(%arg0: tensor, %arg1: tensor, %arg2: tensor<1x2xi64>) -> tensor { - // CHECK: "mhlo.transpose"({{.*}}) {permutation = dense<[1, 0]> : tensor<2xi64>} : (tensor<1x2xi64>) -> tensor<2x1xi64> + // CHECK: "mhlo.transpose"({{.*}}) <{permutation = dense<[1, 0]> : tensor<2xi64>}> : (tensor<1x2xi64>) -> tensor<2x1xi64> // CHECK: mhlo.reshape {{.*}} : (tensor<2x1xi64>) -> tensor<2xi64> - // CHECK: "mhlo.slice"({{.*}}) {limit_indices = dense<1> : tensor<1xi64>, start_indices = dense<0> : tensor<1xi64>, strides = dense<1> : tensor<1xi64>} : (tensor<2xi64>) -> tensor<1xi64> - // CHECK: "mhlo.slice"({{.*}}) {limit_indices = dense<2> : tensor<1xi64>, start_indices = dense<1> : tensor<1xi64>, strides = dense<1> : tensor<1xi64>} : (tensor<2xi64>) -> tensor<1xi64> + // CHECK: "mhlo.slice"({{.*}}) <{limit_indices = dense<1> : tensor<1xi64>, start_indices = dense<0> : tensor<1xi64>, strides = dense<1> : tensor<1xi64>}> : (tensor<2xi64>) -> tensor<1xi64> + // CHECK: "mhlo.slice"({{.*}}) <{limit_indices = dense<2> : tensor<1xi64>, start_indices = dense<1> : tensor<1xi64>, strides = dense<1> : tensor<1xi64>}> : (tensor<2xi64>) -> tensor<1xi64> // CHECK: mhlo.dynamic_pad {{.*}} : (tensor, tensor, tensor<1xi64>, tensor<1xi64>, tensor<1xi64>) -> tensor %1 = "tf.PadV2"(%arg0, %arg2, %arg1) : (tensor, tensor<1x2xi64>, tensor) -> tensor func.return %1 : tensor @@ -1237,7 +1237,7 @@ func.func @checkNumerics(%arg0: tensor<1xf32>) -> tensor<1xf32> { // 
CHECK-LABEL: func @infeed_dequeue_tuple func.func @infeed_dequeue_tuple() -> (tensor<1x8x4x4xi32>, tensor<1x100x1xf32>) { // CHECK: [[TOKEN:%.*]] = mhlo.create_token : !mhlo.token -// CHECK: [[INFEED:%.*]]:3 = "mhlo.infeed"([[TOKEN]]) {infeed_config = ""{{.*}}} : (!mhlo.token) -> (tensor<1x8x4x4xi32>, tensor<1x100x1xf32>, !mhlo.token) +// CHECK: [[INFEED:%.*]]:3 = "mhlo.infeed"([[TOKEN]]) <{infeed_config = ""{{.*}}}> : (!mhlo.token) -> (tensor<1x8x4x4xi32>, tensor<1x100x1xf32>, !mhlo.token) // CHECK: return [[INFEED]]#0, [[INFEED]]#1 %0:2 = "tf.InfeedDequeueTuple"() : () -> (tensor<1x8x4x4xi32>, tensor<1x100x1xf32>) func.return %0#0, %0#1 : tensor<1x8x4x4xi32>, tensor<1x100x1xf32> @@ -1338,7 +1338,7 @@ func.func @matmul_notranspose(%a: tensor<5x7xf32>, %b: tensor<7x11xf32>) -> tens // CHECK-LABEL: matmul_transpose_b // CHECK-SAME: (%[[A:.*]]: tensor<5x7xf32>, %[[B:.*]]: tensor<11x7xf32>) func.func @matmul_transpose_b(%a: tensor<5x7xf32>, %b: tensor<11x7xf32>) -> tensor<5x11xf32> { - // CHECK: %[[UPDATED_B:.*]] = "mhlo.transpose"(%[[B]]) {permutation = dense<[1, 0]> : tensor<2xi64>} + // CHECK: %[[UPDATED_B:.*]] = "mhlo.transpose"(%[[B]]) <{permutation = dense<[1, 0]> : tensor<2xi64>}> // CHECK: "mhlo.dot"(%[[A]], %[[UPDATED_B]]) %0 = "tf.MatMul"(%a, %b) {transpose_a = false, transpose_b = true} : (tensor<5x7xf32>, tensor<11x7xf32>) -> tensor<5x11xf32> @@ -1350,8 +1350,8 @@ func.func @matmul_transpose_b(%a: tensor<5x7xf32>, %b: tensor<11x7xf32>) -> tens // CHECK-LABEL: matmul_transpose_both // CHECK-SAME: (%[[A:.*]]: tensor<7x5xf32>, %[[B:.*]]: tensor<11x7xf32>) func.func @matmul_transpose_both(%a: tensor<7x5xf32>, %b: tensor<11x7xf32>) -> tensor<5x11xf32> { - // CHECK: %[[UPDATED_A:.*]] = "mhlo.transpose"(%[[A]]) {permutation = dense<[1, 0]> : tensor<2xi64>} - // CHECK: %[[UPDATED_B:.*]] = "mhlo.transpose"(%[[B]]) {permutation = dense<[1, 0]> : tensor<2xi64>} + // CHECK: %[[UPDATED_A:.*]] = "mhlo.transpose"(%[[A]]) <{permutation = dense<[1, 0]> : tensor<2xi64>}> + 
// CHECK: %[[UPDATED_B:.*]] = "mhlo.transpose"(%[[B]]) <{permutation = dense<[1, 0]> : tensor<2xi64>}> // CHECK: "mhlo.dot"(%[[UPDATED_A]], %[[UPDATED_B]]) %0 = "tf.MatMul"(%a, %b) {transpose_a = true, transpose_b = true} : (tensor<7x5xf32>, tensor<11x7xf32>) -> tensor<5x11xf32> @@ -1419,9 +1419,9 @@ func.func @test_sparse_mat_mul_with_cast(%arg0: tensor<3x4xf32>, %arg1: tensor<4 func.func @maxpool_valid_padding(%arg0: tensor<2x12x20x7xi32>) -> tensor<2x3x5x7xi32> { // CHECK: %[[INIT:.*]] = mhlo.constant dense<-2147483648> : tensor // CHECK: "mhlo.reduce_window"(%[[ARG]], %[[INIT]]) + // CHECK: <{window_dimensions = dense<[1, 2, 2, 1]> : tensor<4xi64>, window_strides = dense<[1, 4, 4, 1]> : tensor<4xi64>}> // CHECK: mhlo.maximum // CHECK: mhlo.return - // CHECK: {window_dimensions = dense<[1, 2, 2, 1]> : tensor<4xi64>, window_strides = dense<[1, 4, 4, 1]> : tensor<4xi64>} %0 = "tf.MaxPool"(%arg0) {data_format = "NHWC", ksize = [1, 2, 2, 1], padding = "VALID", strides = [1, 4, 4, 1]} : (tensor<2x12x20x7xi32>) -> tensor<2x3x5x7xi32> func.return %0 : tensor<2x3x5x7xi32> @@ -1445,9 +1445,9 @@ func.func @maxpool_same_padding(%arg0: tensor<2x13x25x7xi32>) -> tensor<2x4x7x7x func.func @maxpool_3d_valid_padding(%arg0: tensor<2x8x12x20x7xf32>) -> tensor<2x8x3x5x7xf32> { // CHECK: %[[INIT:.*]] = mhlo.constant dense<0xFF800000> : tensor // CHECK: "mhlo.reduce_window"(%[[ARG]], %[[INIT]]) + // CHECK: <{window_dimensions = dense<[1, 1, 2, 2, 1]> : tensor<5xi64>, window_strides = dense<[1, 1, 4, 4, 1]> : tensor<5xi64>}> // CHECK: mhlo.maximum // CHECK: mhlo.return - // CHECK: {window_dimensions = dense<[1, 1, 2, 2, 1]> : tensor<5xi64>, window_strides = dense<[1, 1, 4, 4, 1]> : tensor<5xi64>} %0 = "tf.MaxPool3D"(%arg0) {data_format = "NDHWC", ksize = [1, 1, 2, 2, 1], padding = "VALID", strides = [1, 1, 4, 4, 1]} : (tensor<2x8x12x20x7xf32>) -> tensor<2x8x3x5x7xf32> func.return %0 : tensor<2x8x3x5x7xf32> @@ -1485,7 +1485,7 @@ func.func @maxpool_explicit_padding(%arg0: 
tensor<2x12x20x7xi32>) -> tensor<2x3x // CHECK-SAME: %[[INPUT:.*]]: tensor<10x24x24x64xf32>, %arg1: tensor<10x12x12x64xf32>, %[[GRAD:.*]]: tensor<10x12x12x64xf32> func.func @max_pool_grad_valid(%orig_input: tensor<10x24x24x64xf32>, %orig_output: tensor<10x12x12x64xf32>, %grad: tensor<10x12x12x64xf32>) -> tensor<10x24x24x64xf32> { // CHECK: %[[ZERO:.*]] = mhlo.constant dense<0.000000e+00> : tensor - // CHECK: %[[RESULT:.*]] = "mhlo.select_and_scatter"(%[[INPUT]], %[[GRAD]], %[[ZERO]]) ({ + // CHECK: %[[RESULT:.*]] = "mhlo.select_and_scatter"(%[[INPUT]], %[[GRAD]], %[[ZERO]]) <{window_dimensions = dense<[1, 2, 2, 1]> : tensor<4xi64>, window_strides = dense<[1, 2, 2, 1]> : tensor<4xi64>}> ({ // CHECK: ^bb0(%[[VALUE_A:.*]]: tensor, %[[VALUE_B:.*]]: tensor): // CHECK: %[[SELECT_RESULT:.*]] = mhlo.compare GE, %[[VALUE_A]], %[[VALUE_B]], NOTYPE : (tensor, tensor) -> tensor // CHECK: mhlo.return %[[SELECT_RESULT]] : tensor @@ -1493,7 +1493,7 @@ func.func @max_pool_grad_valid(%orig_input: tensor<10x24x24x64xf32>, %orig_outpu // CHECK: ^bb0(%[[VALUE_A:.*]]: tensor, %[[VALUE_B:.*]]: tensor): // CHECK: %[[SELECT_RESULT:.*]] = mhlo.add %[[VALUE_A]], %[[VALUE_B]] : tensor // CHECK: mhlo.return %[[SELECT_RESULT]] : tensor - // CHECK: }) {window_dimensions = dense<[1, 2, 2, 1]> : tensor<4xi64>, window_strides = dense<[1, 2, 2, 1]> : tensor<4xi64>} : (tensor<10x24x24x64xf32>, tensor<10x12x12x64xf32>, tensor) -> tensor<10x24x24x64xf32> + // CHECK: }) : (tensor<10x24x24x64xf32>, tensor<10x12x12x64xf32>, tensor) -> tensor<10x24x24x64xf32> // CHECK: return %[[RESULT]] : tensor<10x24x24x64xf32> %result = "tf.MaxPoolGrad"(%orig_input, %orig_output, %grad) { data_format = "NHWC", @@ -1510,7 +1510,7 @@ func.func @max_pool_grad_valid(%orig_input: tensor<10x24x24x64xf32>, %orig_outpu // CHECK-SAME: %[[INPUT:.*]]: tensor<10x8x24x24x64xf32>, %arg1: tensor<10x8x12x12x64xf32>, %[[GRAD:.*]]: tensor<10x8x12x12x64xf32> func.func @max_pool_3d_grad_valid(%orig_input: tensor<10x8x24x24x64xf32>, 
%orig_output: tensor<10x8x12x12x64xf32>, %grad: tensor<10x8x12x12x64xf32>) -> tensor<10x8x24x24x64xf32> { // CHECK: %[[ZERO:.*]] = mhlo.constant dense<0.000000e+00> : tensor - // CHECK: %[[RESULT:.*]] = "mhlo.select_and_scatter"(%[[INPUT]], %[[GRAD]], %[[ZERO]]) ({ + // CHECK: %[[RESULT:.*]] = "mhlo.select_and_scatter"(%[[INPUT]], %[[GRAD]], %[[ZERO]]) <{window_dimensions = dense<[1, 1, 2, 2, 1]> : tensor<5xi64>, window_strides = dense<[1, 1, 2, 2, 1]> : tensor<5xi64>}> ({ // CHECK: ^bb0(%[[VALUE_A:.*]]: tensor, %[[VALUE_B:.*]]: tensor): // CHECK: %[[SELECT_RESULT:.*]] = mhlo.compare GE, %[[VALUE_A]], %[[VALUE_B]], NOTYPE : (tensor, tensor) -> tensor // CHECK: mhlo.return %[[SELECT_RESULT]] : tensor @@ -1518,7 +1518,7 @@ func.func @max_pool_3d_grad_valid(%orig_input: tensor<10x8x24x24x64xf32>, %orig_ // CHECK: ^bb0(%[[VALUE_A:.*]]: tensor, %[[VALUE_B:.*]]: tensor): // CHECK: %[[SELECT_RESULT:.*]] = mhlo.add %[[VALUE_A]], %[[VALUE_B]] : tensor // CHECK: mhlo.return %[[SELECT_RESULT]] : tensor - // CHECK: }) {window_dimensions = dense<[1, 1, 2, 2, 1]> : tensor<5xi64>, window_strides = dense<[1, 1, 2, 2, 1]> : tensor<5xi64>} : (tensor<10x8x24x24x64xf32>, tensor<10x8x12x12x64xf32>, tensor) -> tensor<10x8x24x24x64xf32> + // CHECK: }) : (tensor<10x8x24x24x64xf32>, tensor<10x8x12x12x64xf32>, tensor) -> tensor<10x8x24x24x64xf32> // CHECK: return %[[RESULT]] : tensor<10x8x24x24x64xf32> %result = "tf.MaxPool3DGrad"(%orig_input, %orig_output, %grad) {data_format = "NDHWC", ksize = [1, 1, 2, 2, 1], padding = "VALID", strides = [1, 1, 2, 2, 1]} : (tensor<10x8x24x24x64xf32>, tensor<10x8x12x12x64xf32>, tensor<10x8x12x12x64xf32>) -> tensor<10x8x24x24x64xf32> func.return %result : tensor<10x8x24x24x64xf32> @@ -1555,11 +1555,11 @@ func.func @max_pool_3d_grad_same(%orig_input: tensor<2x8x13x25x7xf32>, %orig_out // CHECK-LABEL:one_hot func.func @one_hot(%indices: tensor<3xi32>, %on_value: tensor, %off_value: tensor) -> tensor<3x5xf32> { - // CHECK: %[[IOTA:.*]] = "mhlo.iota"() 
{iota_dimension = 1 : i64} : () -> tensor<3x5xi32> - // CHECK: %[[BCAST_ARG0:.+]] = "mhlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<0> : tensor<1xi64>} : (tensor<3xi32>) -> tensor<3x5xi32> + // CHECK: %[[IOTA:.*]] = "mhlo.iota"() <{iota_dimension = 1 : i64}> : () -> tensor<3x5xi32> + // CHECK: %[[BCAST_ARG0:.+]] = "mhlo.broadcast_in_dim"(%arg0) <{broadcast_dimensions = dense<0> : tensor<1xi64>}> : (tensor<3xi32>) -> tensor<3x5xi32> // CHECK: %[[COMPARE:.*]] = mhlo.compare EQ, %[[BCAST_ARG0]], %[[IOTA]], NOTYPE : (tensor<3x5xi32>, tensor<3x5xi32>) -> tensor<3x5xi1> - // CHECK: %[[ON_VALUE:.*]] = "mhlo.broadcast"(%arg1) {broadcast_sizes = dense<[3, 5]> : tensor<2xi64>} : (tensor) -> tensor<3x5xf32> - // CHECK: %[[OFF_VALUE:.*]] = "mhlo.broadcast"(%arg2) {broadcast_sizes = dense<[3, 5]> : tensor<2xi64>} : (tensor) -> tensor<3x5xf32> + // CHECK: %[[ON_VALUE:.*]] = "mhlo.broadcast"(%arg1) <{broadcast_sizes = dense<[3, 5]> : tensor<2xi64>}> : (tensor) -> tensor<3x5xf32> + // CHECK: %[[OFF_VALUE:.*]] = "mhlo.broadcast"(%arg2) <{broadcast_sizes = dense<[3, 5]> : tensor<2xi64>}> : (tensor) -> tensor<3x5xf32> // CHECK: %[[RESULT:.*]] = mhlo.select %[[COMPARE]], %[[ON_VALUE]], %[[OFF_VALUE]] : tensor<3x5xi1>, tensor<3x5xf32> // CHECK: return %[[RESULT]] : tensor<3x5xf32> %depth = "tf.Const"() { value = dense<5> : tensor } : () -> tensor @@ -1577,7 +1577,7 @@ func.func @one_hot(%indices: tensor<3xi32>, %on_value: tensor, %off_value: // CHECK-SAME: [[VAL_0:%.*]]: tensor<3xi32>, [[VAL_1:%.*]]: tensor<4xf32>) func.func @outfeed_enqueue_tuple(%data_1: tensor<3xi32>, %data_2: tensor<4xf32>) -> () { // CHECK: [[TOKEN:%.*]] = mhlo.create_token : !mhlo.token -// CHECK: "mhlo.outfeed"([[VAL_0]], [[VAL_1]], [[TOKEN]]) {outfeed_config = ""} : (tensor<3xi32>, tensor<4xf32>, !mhlo.token) -> !mhlo.token +// CHECK: "mhlo.outfeed"([[VAL_0]], [[VAL_1]], [[TOKEN]]) <{outfeed_config = ""}> : (tensor<3xi32>, tensor<4xf32>, !mhlo.token) -> !mhlo.token "tf.OutfeedEnqueueTuple"(%data_1, 
%data_2) : (tensor<3xi32>, tensor<4xf32>) -> () func.return } @@ -1592,7 +1592,7 @@ func.func @outfeed_enqueue_tuple(%data_1: tensor<3xi32>, %data_2: tensor<4xf32>) func.func @pack(%arg0: tensor<2xi32>, %arg1: tensor<2xi32>) -> tensor<2x2xi32> { // CHECK: mhlo.reshape {{.*}} : (tensor<2xi32>) -> tensor<1x2xi32> // CHECK: mhlo.reshape {{.*}} : (tensor<2xi32>) -> tensor<1x2xi32> - // CHECK: "mhlo.concatenate"({{.*}}) {dimension = 0 : i64} : (tensor<1x2xi32>, tensor<1x2xi32>) -> tensor<2x2xi32> + // CHECK: "mhlo.concatenate"({{.*}}) <{dimension = 0 : i64}> : (tensor<1x2xi32>, tensor<1x2xi32>) -> tensor<2x2xi32> %0 = "tf.Pack"(%arg0, %arg1) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2x2xi32> func.return %0 : tensor<2x2xi32> @@ -1689,7 +1689,7 @@ func.func @callee() { func.func @reverse_func_32(%arg0: tensor<5xi32>) -> tensor<5xi32> { %axis = "tf.Const"() {value = dense<0> : tensor<1xi32>} : () -> (tensor<1xi32>) - // CHECK: [[VAL:%.+]] = "mhlo.reverse"(%arg0) {dimensions = dense<0> : tensor<1xi64>} + // CHECK: [[VAL:%.+]] = "mhlo.reverse"(%arg0) <{dimensions = dense<0> : tensor<1xi64>}> %reversed = "tf.ReverseV2"(%arg0, %axis) : (tensor<5xi32>, tensor<1xi32>) -> tensor<5xi32> // CHECK: return [[VAL]] : tensor<5xi32> @@ -1702,7 +1702,7 @@ func.func @reverse_func_32(%arg0: tensor<5xi32>) -> tensor<5xi32> { func.func @reverse_func_64(%arg0: tensor<5xi32>) -> tensor<5xi32> { %axis = "tf.Const"() {value = dense<0> : tensor<1xi64>} : () -> (tensor<1xi64>) - // CHECK: [[VAL:%.+]] = "mhlo.reverse"(%arg0) {dimensions = dense<0> : tensor<1xi64>} + // CHECK: [[VAL:%.+]] = "mhlo.reverse"(%arg0) <{dimensions = dense<0> : tensor<1xi64>}> %reversed = "tf.ReverseV2"(%arg0, %axis) : (tensor<5xi32>, tensor<1xi64>) -> tensor<5xi32> // CHECK: return [[VAL]] : tensor<5xi32> @@ -1715,7 +1715,7 @@ func.func @reverse_func_64(%arg0: tensor<5xi32>) -> tensor<5xi32> { func.func @reverse_func_neg(%arg0: tensor<5x5xi32>) -> tensor<5x5xi32> { %axis = "tf.Const"() {value = dense<[-1]> : 
tensor<1xi32>} : () -> (tensor<1xi32>) - // CHECK: [[VAL:%.+]] = "mhlo.reverse"(%arg0) {dimensions = dense<1> : tensor<1xi64>} + // CHECK: [[VAL:%.+]] = "mhlo.reverse"(%arg0) <{dimensions = dense<1> : tensor<1xi64>}> %reversed = "tf.ReverseV2"(%arg0, %axis) : (tensor<5x5xi32>, tensor<1xi32>) -> tensor<5x5xi32> // CHECK: return [[VAL]] : tensor<5x5xi32> @@ -1762,7 +1762,7 @@ func.func @stateful_pcall_multi_in_out(%arg0: tensor, %arg1: tensor) - // CHECK-LABEL: func @elu func.func @elu(%arg0: tensor<1xf32>) -> tensor<1xf32> { - // CHECK-DAG: %[[ZERO:.*]] = "chlo.constant_like"(%arg0) {value = 0.000000e+00 : f32} : (tensor<1xf32>) -> tensor<1xf32> + // CHECK-DAG: %[[ZERO:.*]] = "chlo.constant_like"(%arg0) <{value = 0.000000e+00 : f32}> : (tensor<1xf32>) -> tensor<1xf32> // CHECK-DAG: %[[PRED:.*]] = mhlo.compare GT, %arg0, %[[ZERO]] // CHECK-DAG: %[[EXP:.*]] = mhlo.exponential_minus_one %arg0 // CHECK: %[[RESULT:.*]] = mhlo.select %[[PRED]], %arg0, %[[EXP]] @@ -1837,8 +1837,8 @@ func.func @relu6_unsigned(%arg0: tensor) -> tensor { // CHECK-LABEL: func @leaky_relu func.func @leaky_relu(%arg0: tensor<1x4x4x3xf32>) -> tensor<1x4x4x3xf32> attributes {tf.entry_function = {}} { - // CHECK-NEXT: %[[ALPHA:.*]] = "chlo.constant_like"(%arg0) {value = 2.000000e-01 : f32} : (tensor<1x4x4x3xf32>) -> tensor<1x4x4x3xf32> - // CHECK-NEXT: %[[ZERO:.*]] = "chlo.constant_like"(%arg0) {value = 0.000000e+00 : f32} : (tensor<1x4x4x3xf32>) -> tensor<1x4x4x3xf32> + // CHECK-NEXT: %[[ALPHA:.*]] = "chlo.constant_like"(%arg0) <{value = 2.000000e-01 : f32}> : (tensor<1x4x4x3xf32>) -> tensor<1x4x4x3xf32> + // CHECK-NEXT: %[[ZERO:.*]] = "chlo.constant_like"(%arg0) <{value = 0.000000e+00 : f32}> : (tensor<1x4x4x3xf32>) -> tensor<1x4x4x3xf32> // CHECK-NEXT: %[[LEAKY:.*]] = mhlo.multiply %[[INP:.*]], %[[ALPHA]] : tensor<1x4x4x3xf32> // CHECK-NEXT: %[[CMP:.*]] = mhlo.compare GT, %[[INP]], %[[ZERO]], NOTYPE : (tensor<1x4x4x3xf32>, tensor<1x4x4x3xf32>) -> tensor<1x4x4x3xi1> // CHECK-NEXT: %[[RES:.*]] = 
mhlo.select %[[CMP]], %[[INP]], %[[LEAKY]] : tensor<1x4x4x3xi1>, tensor<1x4x4x3xf32> @@ -1851,8 +1851,8 @@ func.func @leaky_relu(%arg0: tensor<1x4x4x3xf32>) -> tensor<1x4x4x3xf32> attribu // CHECK-LABEL: func @leaky_relu_grad func.func @leaky_relu_grad(%arg0: tensor<1x4x4xf32>, %arg1: tensor<1x4x4xf32>) -> tensor<1x4x4xf32> attributes {tf.entry_function = {}} { - // CHECK-NEXT: %[[ALPHA:.*]] = "chlo.constant_like"(%arg1) {value = 2.000000e-01 : f32} : (tensor<1x4x4xf32>) -> tensor<1x4x4xf32> - // CHECK-NEXT: %[[ZERO:.*]] = "chlo.constant_like"(%arg1) {value = 0.000000e+00 : f32} : (tensor<1x4x4xf32>) -> tensor<1x4x4xf32> + // CHECK-NEXT: %[[ALPHA:.*]] = "chlo.constant_like"(%arg1) <{value = 2.000000e-01 : f32}> : (tensor<1x4x4xf32>) -> tensor<1x4x4xf32> + // CHECK-NEXT: %[[ZERO:.*]] = "chlo.constant_like"(%arg1) <{value = 0.000000e+00 : f32}> : (tensor<1x4x4xf32>) -> tensor<1x4x4xf32> // CHECK-NEXT: %[[LEAKYGRAD:.*]] = mhlo.multiply %[[GRADIENT:.*]], %[[ALPHA]] : tensor<1x4x4xf32> // CHECK-NEXT: %[[CMP:.*]] = mhlo.compare GT, %[[INP:.*]], %[[ZERO]], NOTYPE : (tensor<1x4x4xf32>, tensor<1x4x4xf32>) -> tensor<1x4x4xi1> // CHECK-NEXT: %[[RES:.*]] = mhlo.select %[[CMP]], %[[GRADIENT]], %[[LEAKYGRAD]] : tensor<1x4x4xi1>, tensor<1x4x4xf32> @@ -1865,7 +1865,7 @@ func.func @leaky_relu_grad(%arg0: tensor<1x4x4xf32>, %arg1: tensor<1x4x4xf32>) - // CHECK-LABEL: func @softsign func.func @softsign(%arg0: tensor<4x10xf32>) -> tensor<4x10xf32> { - // CHECK-NEXT: %[[ONE:.*]] = "chlo.constant_like"(%arg0) {value = 1.000000e+00 : f32} : (tensor<4x10xf32>) -> tensor<4x10xf32> + // CHECK-NEXT: %[[ONE:.*]] = "chlo.constant_like"(%arg0) <{value = 1.000000e+00 : f32}> : (tensor<4x10xf32>) -> tensor<4x10xf32> // CHECK-NEXT: %[[ABS:.*]] = mhlo.abs %{{.*}} : tensor<4x10xf32> // CHECK-NEXT: %[[ADD:.*]] = mhlo.add %[[ONE]], %[[ABS]] : tensor<4x10xf32> // CHECK-NEXT: %[[DIV:.*]] = mhlo.divide %{{.*}}, %[[ADD]] : tensor<4x10xf32> @@ -1903,7 +1903,7 @@ func.func @Roll_0D(%arg0: tensor<512xi32>, 
%shift: tensor) -> tensor<512xi3 // CHECK: %[[T1:.+]] = mhlo.remainder %arg1, %[[AXIS_SIZE]] : tensor // CHECK: %[[T2:.+]] = mhlo.add %[[T1]], %[[AXIS_SIZE]] : tensor // CHECK: %[[T3:.+]] = mhlo.remainder %[[T2]], %[[AXIS_SIZE]] : tensor - // CHECK: %[[CONCAT:.+]] = "mhlo.concatenate"(%arg0, %arg0) {dimension = 0 : i64} + // CHECK: %[[CONCAT:.+]] = "mhlo.concatenate"(%arg0, %arg0) <{dimension = 0 : i64}> // CHECK: %[[OFFSET:.+]] = mhlo.subtract %[[AXIS_SIZE]], %[[T3]] : tensor // CHECK: "mhlo.dynamic_slice"(%[[CONCAT]], %[[OFFSET]]) // CHECK-SAME: {slice_sizes = dense<512> : tensor<1xi64>} @@ -1920,7 +1920,7 @@ func.func @Roll_0D(%arg0: tensor<512xi32>, %shift: tensor) -> tensor<512xi3 // CHECK-LABEL: func @select_batch_static func.func @select_batch_static(%arg0: tensor<2xi1>, %arg1: tensor<2x6x8xi32>, %arg2: tensor<2x6x8xi32>) -> tensor<2x6x8xi32> { - // CHECK: %[[BCAST:.*]] = "mhlo.dynamic_broadcast_in_dim"(%arg0, %{{.*}}) {broadcast_dimensions = dense<0> : tensor<1xi64>} : (tensor<2xi1>, tensor<3xindex>) -> tensor<2x6x8xi1> + // CHECK: %[[BCAST:.*]] = "mhlo.dynamic_broadcast_in_dim"(%arg0, %{{.*}}) <{broadcast_dimensions = dense<0> : tensor<1xi64>}> : (tensor<2xi1>, tensor<3xindex>) -> tensor<2x6x8xi1> // CHECK: mhlo.select %[[BCAST]], %arg1, %arg2 %0 = "tf.Select"(%arg0, %arg1, %arg2) : (tensor<2xi1>, tensor<2x6x8xi32>, tensor<2x6x8xi32>) -> tensor<2x6x8xi32> func.return %0: tensor<2x6x8xi32> @@ -1958,7 +1958,7 @@ func.func @select_batch_dynamic_r1(%arg0: tensor, %arg1: tensor // CHECK-NEXT: %[[SHAPEEQ:.*]] = shape.assuming_all %[[SHAPEEQ1]], %[[SHAPEEQ2]] // CHECK-NEXT: %[[ASSUMING:.*]] = shape.assuming %[[SHAPEEQ]] -> (tensor) { // CHECK-NEXT: %[[SHAPE1E:.*]] = shape.to_extent_tensor %[[SHAPE1]] : tensor<3xindex> -> tensor<3xindex> - // CHECK-NEXT: %[[BCAST:.*]] = "mhlo.dynamic_broadcast_in_dim"(%arg0, %[[SHAPE1E]]) {broadcast_dimensions = dense<0> : tensor<1xi64>} : (tensor, tensor<3xindex>) -> tensor + // CHECK-NEXT: %[[BCAST:.*]] = 
"mhlo.dynamic_broadcast_in_dim"(%arg0, %[[SHAPE1E]]) <{broadcast_dimensions = dense<0> : tensor<1xi64>}> : (tensor, tensor<3xindex>) -> tensor // CHECK-NEXT: %[[SELECT:.*]] = mhlo.select %[[BCAST]], %arg1, %arg2 : tensor, tensor // CHECK-NEXT: shape.assuming_yield %[[SELECT]] : tensor %0 = "tf.Select"(%arg0, %arg1, %arg2) : (tensor, tensor, tensor) -> tensor @@ -2026,7 +2026,7 @@ func.func @selectv2_dynamic_ranked(%arg0: tensor<1xi1>, %arg1: tensor<2x?x8xi32> // CHECK-LABEL: func @fft_1D func.func @fft_1D(%arg0: tensor<8xcomplex>) -> tensor<8xcomplex> { - // CHECK: "mhlo.fft"(%arg0) {fft_length = dense<8> : tensor<1xi64>, fft_type = #mhlo} : (tensor<8xcomplex> + // CHECK: "mhlo.fft"(%arg0) <{fft_length = dense<8> : tensor<1xi64>, fft_type = #mhlo}> : (tensor<8xcomplex> %0 = "tf.FFT"(%arg0) : (tensor<8xcomplex>) -> tensor<8xcomplex> func.return %0 : tensor<8xcomplex> } @@ -2035,7 +2035,7 @@ func.func @fft_1D(%arg0: tensor<8xcomplex>) -> tensor<8xcomplex> { // CHECK-LABEL: func @ifft_1D func.func @ifft_1D(%arg0: tensor<8xcomplex>) -> tensor<8xcomplex> { - // CHECK: "mhlo.fft"(%arg0) {fft_length = dense<8> : tensor<1xi64>, fft_type = #mhlo} : (tensor<8xcomplex> + // CHECK: "mhlo.fft"(%arg0) <{fft_length = dense<8> : tensor<1xi64>, fft_type = #mhlo}> : (tensor<8xcomplex> %0 = "tf.IFFT"(%arg0) : (tensor<8xcomplex>) -> tensor<8xcomplex> func.return %0 : tensor<8xcomplex> } @@ -2045,7 +2045,7 @@ func.func @ifft_1D(%arg0: tensor<8xcomplex>) -> tensor<8xcomplex> { // CHECK-LABEL: func @rfft_1D func.func @rfft_1D(%arg0: tensor<8xf32>) -> tensor<5xcomplex> { %fftlength = "tf.Const"() {value = dense<[8]> : tensor<1xi32>} : () -> (tensor<1xi32>) - // CHECK: "mhlo.fft"(%arg0) {fft_length = dense<8> : tensor<1xi64>, fft_type = #mhlo} : (tensor<8xf32> + // CHECK: "mhlo.fft"(%arg0) <{fft_length = dense<8> : tensor<1xi64>, fft_type = #mhlo}> : (tensor<8xf32> %0 = "tf.RFFT"(%arg0, %fftlength) : (tensor<8xf32>, tensor<1xi32>) -> tensor<5xcomplex> func.return %0 : tensor<5xcomplex> } 
@@ -2055,8 +2055,8 @@ func.func @rfft_1D(%arg0: tensor<8xf32>) -> tensor<5xcomplex> { // CHECK-LABEL: func @rfft_1D_padded func.func @rfft_1D_padded(%arg0: tensor<7xf32>) -> tensor<5xcomplex> { %fftlength = "tf.Const"() {value = dense<[8]> : tensor<1xi32>} : () -> (tensor<1xi32>) - // CHECK: %[[PADDED:.*]] = "mhlo.pad"(%arg0, %{{.*}}) {edge_padding_high = dense<1> : tensor<1xi64>, edge_padding_low = dense<0> : tensor<1xi64>, interior_padding = dense<0> : tensor<1xi64>} : (tensor<7xf32>, tensor) -> tensor<8xf32> - // CHECK: "mhlo.fft"(%[[PADDED]]) {fft_length = dense<8> : tensor<1xi64>, fft_type = #mhlo} : (tensor<8xf32> + // CHECK: %[[PADDED:.*]] = "mhlo.pad"(%arg0, %{{.*}}) <{edge_padding_high = dense<1> : tensor<1xi64>, edge_padding_low = dense<0> : tensor<1xi64>, interior_padding = dense<0> : tensor<1xi64>}> : (tensor<7xf32>, tensor) -> tensor<8xf32> + // CHECK: "mhlo.fft"(%[[PADDED]]) <{fft_length = dense<8> : tensor<1xi64>, fft_type = #mhlo}> : (tensor<8xf32> %0 = "tf.RFFT"(%arg0, %fftlength) : (tensor<7xf32>, tensor<1xi32>) -> tensor<5xcomplex> func.return %0 : tensor<5xcomplex> } @@ -2066,8 +2066,8 @@ func.func @rfft_1D_padded(%arg0: tensor<7xf32>) -> tensor<5xcomplex> { // CHECK-LABEL: func @rfft_1D_sliced func.func @rfft_1D_sliced(%arg0: tensor<2x9xf32>) -> tensor<2x5xcomplex> { %fftlength = "tf.Const"() {value = dense<[8]> : tensor<1xi32>} : () -> (tensor<1xi32>) - // CHECK: %[[SLICED:.*]] = "mhlo.slice"(%arg0) {limit_indices = dense<[2, 8]> : tensor<2xi64>, start_indices = dense<0> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<2x9xf32>) -> tensor<2x8xf32> - // CHECK: "mhlo.fft"(%[[SLICED]]) {fft_length = dense<8> : tensor<1xi64>, fft_type = #mhlo} : (tensor<2x8xf32> + // CHECK: %[[SLICED:.*]] = "mhlo.slice"(%arg0) <{limit_indices = dense<[2, 8]> : tensor<2xi64>, start_indices = dense<0> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>}> : (tensor<2x9xf32>) -> tensor<2x8xf32> + // CHECK: "mhlo.fft"(%[[SLICED]]) <{fft_length = dense<8> 
: tensor<1xi64>, fft_type = #mhlo}> : (tensor<2x8xf32> %0 = "tf.RFFT"(%arg0, %fftlength) : (tensor<2x9xf32>, tensor<1xi32>) -> tensor<2x5xcomplex> func.return %0 : tensor<2x5xcomplex> } @@ -2077,8 +2077,8 @@ func.func @rfft_1D_sliced(%arg0: tensor<2x9xf32>) -> tensor<2x5xcomplex> { // CHECK-LABEL: func @irfft_1D func.func @irfft_1D(%arg0: tensor<8xcomplex>) -> tensor<8xf32> { %fftlength = "tf.Const"() {value = dense<[8]> : tensor<1xi32>} : () -> (tensor<1xi32>) - // CHECK: %[[SLICED:.*]] = "mhlo.slice"(%arg0) {limit_indices = dense<5> : tensor<1xi64>, start_indices = dense<0> : tensor<1xi64>, strides = dense<1> : tensor<1xi64>} : (tensor<8xcomplex>) -> tensor<5xcomplex> - // CHECK: "mhlo.fft"(%[[SLICED]]) {fft_length = dense<8> : tensor<1xi64>, fft_type = #mhlo} : (tensor<5xcomplex> + // CHECK: %[[SLICED:.*]] = "mhlo.slice"(%arg0) <{limit_indices = dense<5> : tensor<1xi64>, start_indices = dense<0> : tensor<1xi64>, strides = dense<1> : tensor<1xi64>}> : (tensor<8xcomplex>) -> tensor<5xcomplex> + // CHECK: "mhlo.fft"(%[[SLICED]]) <{fft_length = dense<8> : tensor<1xi64>, fft_type = #mhlo}> : (tensor<5xcomplex> %0 = "tf.IRFFT"(%arg0, %fftlength) : (tensor<8xcomplex>, tensor<1xi32>) -> tensor<8xf32> func.return %0 : tensor<8xf32> } @@ -2816,7 +2816,7 @@ func.func @slice_constant_start(%arg0: tensor<4xi32>) -> tensor<2xi32> { // CHECK-LABEL: slice_i32_consts func.func @slice_i32_consts(%arg0: tensor<4xi32>) -> tensor<2xi32> { // CHECK: %[[START:.*]] = mhlo.constant dense<1> : tensor - // CHECK: "mhlo.dynamic_slice"(%arg0, %[[START]]) {slice_sizes = dense<2> : tensor<1xi64>} : (tensor<4xi32>, tensor) -> tensor<2xi32> + // CHECK: "mhlo.dynamic_slice"(%arg0, %[[START]]) <{slice_sizes = dense<2> : tensor<1xi64>}> : (tensor<4xi32>, tensor) -> tensor<2xi32> %starts = "tf.Const"() {value = dense<[1]> : tensor<1xi32>} : () -> (tensor<1xi32>) %sizes = "tf.Const"() {value = dense<[2]> : tensor<1xi32>} : () -> (tensor<1xi32>) %0 = "tf.Slice"(%arg0, %starts, %sizes) : 
(tensor<4xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<2xi32> @@ -2828,7 +2828,7 @@ func.func @slice_i32_consts(%arg0: tensor<4xi32>) -> tensor<2xi32> { // CHECK-LABEL: slice_constant_start_negative_one_size func.func @slice_constant_start_negative_one_size(%arg0: tensor<4xi32>) -> tensor<3xi32> { // CHECK: %[[START:.*]] = mhlo.constant dense<1> : tensor - // CHECK: %[[RESULT:.*]] = "mhlo.dynamic_slice"(%arg0, %[[START]]) {slice_sizes = dense<3> : tensor<1xi64>} : (tensor<4xi32>, tensor) -> tensor<3xi32> + // CHECK: %[[RESULT:.*]] = "mhlo.dynamic_slice"(%arg0, %[[START]]) <{slice_sizes = dense<3> : tensor<1xi64>}> : (tensor<4xi32>, tensor) -> tensor<3xi32> // CHECK: return %[[RESULT]] : tensor<3xi32> %starts = "tf.Const"() {value = dense<[1]> : tensor<1xi64>} : () -> (tensor<1xi64>) %sizes = "tf.Const"() {value = dense<[-1]> : tensor<1xi64>} : () -> (tensor<1xi64>) @@ -2867,7 +2867,7 @@ func.func @slice_variable_start(%arg0: tensor<3x4xi32>, %arg1: tensor<2xi64>) -> // CHECK-DAG-SAME: start_indices = dense<1> : tensor<1xi64>, // CHECK-DAG-SAME: strides = dense<1> : tensor<1xi64>} : (tensor<2xi64>) -> tensor<1xi64> // CHECK: %[[RESHAPED_START2:.*]] = mhlo.reshape %[[SLICED_START2]] : (tensor<1xi64>) -> tensor - // CHECK: %[[RESULT:.*]] = "mhlo.dynamic_slice"(%arg0, %[[RESHAPED_START1]], %[[RESHAPED_START2]]) {slice_sizes = dense<[1, 4]> : tensor<2xi64>} : (tensor<3x4xi32>, tensor, tensor) -> tensor<1x4xi32> + // CHECK: %[[RESULT:.*]] = "mhlo.dynamic_slice"(%arg0, %[[RESHAPED_START1]], %[[RESHAPED_START2]]) <{slice_sizes = dense<[1, 4]> : tensor<2xi64>}> : (tensor<3x4xi32>, tensor, tensor) -> tensor<1x4xi32> // CHECK: return %[[RESULT]] : tensor<1x4xi32> %sizes = "tf.Const"() {value = dense<[1, 4]> : tensor<2xi64>} : () -> (tensor<2xi64>) %0 = "tf.Slice"(%arg0, %arg1, %sizes) : (tensor<3x4xi32>, tensor<2xi64>, tensor<2xi64>) -> tensor<1x4xi32> @@ -2961,7 +2961,7 @@ func.func @strided_slice_negative_indices(%input: tensor<4x8xf32>) -> tensor<3x2 %end = "tf.Const"() 
{value = dense<[-4, -8]> : tensor<2xi32>} : () -> (tensor<2xi32>) %strides = "tf.Const"() {value = dense<[-1, -3]> : tensor<2xi32>} : () -> (tensor<2xi32>) - // CHECK: "mhlo.reverse"(%arg0) {dimensions = dense<[0, 1]> : tensor<2xi64>} + // CHECK: "mhlo.reverse"(%arg0) <{dimensions = dense<[0, 1]> : tensor<2xi64>}> // CHECK: mhlo.slice // CHECK-DAG-SAME: start_indices = dense<[0, 1]> @@ -3581,7 +3581,7 @@ func.func @any_dynamic(%input: tensor<4x?xi1>) -> tensor<4x1xi1> { // CHECK-LABEL: func @tile_by_reshape func.func @tile_by_reshape(%arg0: tensor<4x8xf32>) -> tensor<28x24xf32> { - // CHECK: %[[BROADCASTED:.*]] = "mhlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<[1, 3]> : tensor<2xi64>} : (tensor<4x8xf32>) -> tensor<7x4x3x8xf32> + // CHECK: %[[BROADCASTED:.*]] = "mhlo.broadcast_in_dim"(%arg0) <{broadcast_dimensions = dense<[1, 3]> : tensor<2xi64>}> : (tensor<4x8xf32>) -> tensor<7x4x3x8xf32> // CHECK: %[[RESULT:.*]] = mhlo.reshape %[[BROADCASTED]] : (tensor<7x4x3x8xf32>) -> tensor<28x24xf32> // CHECK: return %[[RESULT]] : tensor<28x24xf32> %multiples = "tf.Const"() { value = dense<[7,3]> : tensor<2xi64> } : () -> tensor<2xi64> @@ -3593,7 +3593,7 @@ func.func @tile_by_reshape(%arg0: tensor<4x8xf32>) -> tensor<28x24xf32> { // CHECK-LABEL: func @tile_just_broadcast func.func @tile_just_broadcast(%arg0: tensor<1x1xf32>) -> tensor<7x3xf32> { - // CHECK: %[[RESULT:.*]] = "mhlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<1x1xf32>) -> tensor<7x3xf32> + // CHECK: %[[RESULT:.*]] = "mhlo.broadcast_in_dim"(%arg0) <{broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>}> : (tensor<1x1xf32>) -> tensor<7x3xf32> // CHECK: return %[[RESULT]] : tensor<7x3xf32> %multiples = "tf.Const"() { value = dense<[7,3]> : tensor<2xi64> } : () -> tensor<2xi64> %0 = "tf.Tile"(%arg0, %multiples) : (tensor<1x1xf32>, tensor<2xi64>) -> tensor<7x3xf32> @@ -3607,7 +3607,7 @@ func.func @tile_dynamic_shape(%arg0: tensor) -> tensor { %multiples = 
"tf.Const"() { value = dense<[7,3]> : tensor<2xi32> } : () -> tensor<2xi32> // CHECK: tensor.dim {{.*}} : tensor // CHECK: tensor.from_elements {{.*}} : tensor<4xindex> - // CHECK: "mhlo.dynamic_broadcast_in_dim"({{.*}}) {broadcast_dimensions = dense<[1, 3]> : tensor<2xi64>} : (tensor, tensor<4xindex>) -> tensor + // CHECK: "mhlo.dynamic_broadcast_in_dim"({{.*}}) <{broadcast_dimensions = dense<[1, 3]> : tensor<2xi64>}> : (tensor, tensor<4xindex>) -> tensor // CHECK: muli {{.*}} : index // CHECK: tensor.from_elements {{.*}} : tensor<2xindex> // CHECK: mhlo.dynamic_reshape {{.*}} : (tensor, tensor<2xindex>) -> tensor @@ -3626,7 +3626,7 @@ func.func @argmax_i64_input_i32_output_axis_0(%arg0: tensor<3x7xi64>) -> tensor< // CHECK: %[[INIT:.*]] = mhlo.constant dense<-9223372036854775808> : tensor // CHECK-NEXT: %[[INDEX_INIT:.*]] = mhlo.constant dense<0> : tensor // CHECK: %[[SHAPE:.*]] = shape.shape_of %arg0 : tensor<3x7xi64> -> tensor<2xindex> - // CHECK: %[[INDEX:.*]] = "mhlo.dynamic_iota"(%[[SHAPE]]) {iota_dimension = 0 : i64} : (tensor<2xindex>) -> tensor<3x7xi32> + // CHECK: %[[INDEX:.*]] = "mhlo.dynamic_iota"(%[[SHAPE]]) <{iota_dimension = 0 : i64}> : (tensor<2xindex>) -> tensor<3x7xi32> // CHECK: %[[REDUCE:.*]]:2 = mhlo.reduce(%arg0 init: %[[INIT]]), (%[[INDEX]] init: %[[INDEX_INIT]]) // CHECK: (%[[ARG1:.*]]: tensor, %[[ARG3:.*]]: tensor) (%[[ARG2:.*]]: tensor, %[[ARG4:.*]]: tensor) // CHECK: %[[COMPARE:.*]] = mhlo.compare GE, %[[ARG1]], %[[ARG3]], NOTYPE : (tensor, tensor) -> tensor @@ -3649,7 +3649,7 @@ func.func @argmax_f32_input_i64_output_axis_1(%arg0: tensor<3x7xf32>) -> tensor< // CHECK: %[[INIT:.*]] = mhlo.constant dense<0xFF800000> : tensor // CHECK-NEXT: %[[INDEX_INIT:.*]] = mhlo.constant dense<0> : tensor // CHECK: %[[SHAPE:.*]] = shape.shape_of %arg0 : tensor<3x7xf32> -> tensor<2xindex> - // CHECK: %[[INDEX:.*]] = "mhlo.dynamic_iota"(%[[SHAPE]]) {iota_dimension = 1 : i64} : (tensor<2xindex>) -> tensor<3x7xi64> + // CHECK: %[[INDEX:.*]] = 
"mhlo.dynamic_iota"(%[[SHAPE]]) <{iota_dimension = 1 : i64}> : (tensor<2xindex>) -> tensor<3x7xi64> // CHECK: %[[REDUCE:.*]]:2 = mhlo.reduce(%arg0 init: %[[INIT]]), (%[[INDEX]] init: %[[INDEX_INIT]]) // CHECK: return %[[REDUCE]]#1 : tensor<3xi64> %axis = "tf.Const"() { value = dense<1> : tensor } : () -> tensor @@ -3664,7 +3664,7 @@ func.func @argmax_i1_input_i64_output_axis_1(%arg0: tensor<3x7xi1>) -> tensor<3x // CHECK-DAG: %[[INIT:.*]] = mhlo.constant dense : tensor // CHECK-DAG: %[[INDEX_INIT:.*]] = mhlo.constant dense<0> : tensor // CHECK: %[[SHAPE:.*]] = shape.shape_of %arg0 : tensor<3x7xi1> -> tensor<2xindex> - // CHECK: %[[INDEX:.*]] = "mhlo.dynamic_iota"(%[[SHAPE]]) {iota_dimension = 1 : i64} : (tensor<2xindex>) -> tensor<3x7xi64> + // CHECK: %[[INDEX:.*]] = "mhlo.dynamic_iota"(%[[SHAPE]]) <{iota_dimension = 1 : i64}> : (tensor<2xindex>) -> tensor<3x7xi64> // CHECK: %[[REDUCE:.*]]:2 = mhlo.reduce(%arg0 init: %[[INIT]]), (%[[INDEX]] init: %[[INDEX_INIT]]) // CHECK: return %[[REDUCE]]#1 : tensor<3xi64> %axis = "tf.Const"() { value = dense<1> : tensor } : () -> tensor @@ -3679,7 +3679,7 @@ func.func @argmax_dynamic_shape_input_output(%arg0: tensor<3x?xi32>) -> tensor : tensor // CHECK-NEXT: %[[INDEX_INIT:.*]] = mhlo.constant dense<0> : tensor // CHECK: %[[SHAPE:.*]] = shape.shape_of %arg0 : tensor<3x?xi32> -> tensor<2xindex> - // CHECK: %[[INDEX:.*]] = "mhlo.dynamic_iota"(%[[SHAPE]]) {iota_dimension = 0 : i64} : (tensor<2xindex>) -> tensor<3x?xi32> + // CHECK: %[[INDEX:.*]] = "mhlo.dynamic_iota"(%[[SHAPE]]) <{iota_dimension = 0 : i64}> : (tensor<2xindex>) -> tensor<3x?xi32> // CHECK: %[[REDUCE:.*]]:2 = mhlo.reduce(%arg0 init: %[[INIT]]), (%[[INDEX]] init: %[[INDEX_INIT]]) // CHECK: return %[[REDUCE]]#1 : tensor %axis = "tf.Const"() { value = dense<0> : tensor } : () -> tensor @@ -3694,7 +3694,7 @@ func.func @argmax_dynamic_shape_input(%arg0: tensor<3x?xi32>) -> tensor<3xi32> { // CHECK-DAG: %[[INIT:.*]] = mhlo.constant dense<-2147483648> : tensor // 
CHECK-DAG: %[[INDEX_INIT:.*]] = mhlo.constant dense<0> : tensor // CHECK: %[[SHAPE:.*]] = shape.shape_of %arg0 : tensor<3x?xi32> -> tensor<2xindex> - // CHECK: %[[INDEX:.*]] = "mhlo.dynamic_iota"(%[[SHAPE]]) {iota_dimension = 1 : i64} : (tensor<2xindex>) -> tensor<3x?xi32> + // CHECK: %[[INDEX:.*]] = "mhlo.dynamic_iota"(%[[SHAPE]]) <{iota_dimension = 1 : i64}> : (tensor<2xindex>) -> tensor<3x?xi32> // CHECK: %[[REDUCE:.*]]:2 = mhlo.reduce(%arg0 init: %[[INIT]]), (%[[INDEX]] init: %[[INDEX_INIT]]) // CHECK: return %[[REDUCE]]#1 : tensor<3xi32> %axis = "tf.Const"() { value = dense<1> : tensor } : () -> tensor @@ -3709,7 +3709,7 @@ func.func @argmin_i64_input_i32_output_axis_0(%arg0: tensor<3x7xi64>) -> tensor< // CHECK: %[[INIT:.*]] = mhlo.constant dense<9223372036854775807> : tensor // CHECK-NEXT: %[[INDEX_INIT:.*]] = mhlo.constant dense<0> : tensor // CHECK: %[[SHAPE:.*]] = shape.shape_of %arg0 : tensor<3x7xi64> -> tensor<2xindex> - // CHECK: %[[INDEX:.*]] = "mhlo.dynamic_iota"(%[[SHAPE]]) {iota_dimension = 0 : i64} : (tensor<2xindex>) -> tensor<3x7xi32> + // CHECK: %[[INDEX:.*]] = "mhlo.dynamic_iota"(%[[SHAPE]]) <{iota_dimension = 0 : i64}> : (tensor<2xindex>) -> tensor<3x7xi32> // CHECK: %[[REDUCE:.*]]:2 = mhlo.reduce(%arg0 init: %[[INIT]]), (%[[INDEX]] init: %[[INDEX_INIT]]) // CHECK: (%[[ARG1:.*]]: tensor, %[[ARG3:.*]]: tensor) (%[[ARG2:.*]]: tensor, %[[ARG4:.*]]: tensor) // CHECK: %[[COMPARE:.*]] = mhlo.compare LE, %[[ARG1]], %[[ARG3]], NOTYPE : (tensor, tensor) -> tensor @@ -3763,7 +3763,7 @@ func.func @random_uniform_with_seeds(%arg0: tensor<4xi32>) -> tensor<32x12x12x64 // CHECK-NEXT: %1 = mhlo.constant dense<0.000000e+00> : tensor // CHECK-NEXT: %2 = mhlo.constant dense<1.000000e+00> : tensor // CHECK-NEXT: %3 = mhlo.convert %0 : (tensor<4xi32>) -> tensor<4xi64> - // CHECK-NEXT: %4 = "mhlo.rng"(%1, %2, %3) {rng_distribution = #mhlo.rng_distribution} : (tensor, tensor, tensor<4xi64>) -> tensor<32x12x12x64xf32> + // CHECK-NEXT: %4 = "mhlo.rng"(%1, %2, %3) 
<{rng_distribution = #mhlo.rng_distribution}> : (tensor, tensor, tensor<4xi64>) -> tensor<32x12x12x64xf32> %cst = "tf.Const"() {value = dense<[32, 12, 12, 64]> : tensor<4xi32>} : () -> tensor<4xi32> %0 = "tf.RandomUniform"(%cst) {seed = 87654321 : i64, seed2 = 0 : i64} : (tensor<4xi32>) -> tensor<32x12x12x64xf32> // CHECK: return %4 : tensor<32x12x12x64xf32> @@ -3813,7 +3813,7 @@ func.func @range_dynamic(%arg0: tensor, %arg1: tensor, %arg2: tensor // CHECK-DAG: [[CONVERT_3:%.+]] = mhlo.convert %arg0 // CHECK-DAG: [[CONVERT_4:%.+]] = mhlo.convert %arg2 // CHECK-DAG: [[MUL:%.+]] = chlo.broadcast_multiply [[IOTA]], [[CONVERT_4]] {broadcast_dimensions = array} @@ -3837,7 +3837,7 @@ func.func @range_int_dynamic(%arg0: tensor, %arg1: tensor, %arg2: tens // CHECK-DAG: [[CEIL:%.+]] = mhlo.ceil [[DIV]] // CHECK-DAG: [[CONVERT_3:%.+]] = mhlo.convert [[CEIL]] // CHECK-DAG: [[RESHAPE:%.+]] = mhlo.reshape [[CONVERT_3]] - // CHECK-DAG: [[IOTA:%.+]] = "mhlo.dynamic_iota"([[RESHAPE]]) {iota_dimension = 0 : i64} + // CHECK-DAG: [[IOTA:%.+]] = "mhlo.dynamic_iota"([[RESHAPE]]) <{iota_dimension = 0 : i64}> // CHECK-DAG: [[CONVERT_3:%.+]] = mhlo.convert %arg0 // CHECK-DAG: [[CONVERT_4:%.+]] = mhlo.convert %arg2 // CHECK-DAG: [[MUL:%.+]] = chlo.broadcast_multiply [[IOTA]], [[CONVERT_4]] {broadcast_dimensions = array} @@ -3859,7 +3859,7 @@ func.func @linspace_static(%arg0: tensor, %arg1: tensor) -> tensor<4xf // CHECK-DAG: [[STEP_DENOMINATOR:%.*]] = chlo.broadcast_subtract [[NUM_F32]], [[ONE]] // CHECK-DAG: [[STEP_NUMERATOR:%.*]] = chlo.broadcast_subtract [[STOP]], [[START]] // CHECK-DAG: [[STEP:%.*]] = chlo.broadcast_divide [[STEP_NUMERATOR]], [[STEP_DENOMINATOR]] - // CHECK-DAG: [[IOTA:%.*]] = "mhlo.iota"() {iota_dimension = 0 : i64} + // CHECK-DAG: [[IOTA:%.*]] = "mhlo.iota"() <{iota_dimension = 0 : i64}> // CHECK-DAG: [[MUL:%.*]] = chlo.broadcast_multiply [[IOTA]], [[STEP]] {broadcast_dimensions = array} // CHECK-DAG: [[LINSPACE:%.*]] = chlo.broadcast_add [[MUL]], [[START]] 
{broadcast_dimensions = array} // CHECK: return [[LINSPACE]] @@ -3998,7 +3998,7 @@ func.func @conv_explicit_paddings(%arg0: tensor<256x32x32x6xf32>, %arg1: tensor< // CHECK-LABEL: @conv2d_backprop_input_dynamic func.func @conv2d_backprop_input_dynamic(%filter: tensor<2x2x1x16xf32>, %out_backprop: tensor) -> tensor { - // CHECK: %[[REV_FILTER:.*]] = "mhlo.reverse"(%arg0) {dimensions = dense<[0, 1]> : tensor<2xi64>} + // CHECK: %[[REV_FILTER:.*]] = "mhlo.reverse"(%arg0) <{dimensions = dense<[0, 1]> : tensor<2xi64>}> // CHECK: %[[RESULT:.*]] = mhlo.convolution(%arg1, %[[REV_FILTER]]) // CHECK-SAME: dim_numbers = [b, 0, 1, f]x[0, 1, o, i]->[b, 0, 1, f] // CHECK-SAME{LITERAL}: window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [2, 2], rhs_dilate = [1, 1]} @@ -4023,7 +4023,7 @@ func.func @conv2d_backprop_input( %filter: tensor<3x3x1x32xf32>, %out_backprop: tensor<100x26x26x32xf32> ) -> tensor<100x28x28x1xf32> { - // CHECK: %[[REV_FILTER:.*]] = "mhlo.reverse"(%arg0) {dimensions = dense<[0, 1]> : tensor<2xi64>} + // CHECK: %[[REV_FILTER:.*]] = "mhlo.reverse"(%arg0) <{dimensions = dense<[0, 1]> : tensor<2xi64>}> // CHECK: %[[RESULT:.*]] = mhlo.convolution(%arg1, %[[REV_FILTER]]) // CHECK-SAME: dim_numbers = [b, 0, 1, f]x[0, 1, o, i]->[b, 0, 1, f] // CHECK-SAME{LITERAL}: window = {stride = [1, 1], pad = [[2, 2], [2, 2]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} @@ -4073,7 +4073,7 @@ func.func @conv2d_backprop_input_grouped( // CHECK-LABEL: @conv3d_backprop_input func.func @conv3d_backprop_input(%filter: tensor<3x3x3x1x6xf32>, %out_backprop: tensor<2x8x8x8x6xf32>) -> tensor<2x8x8x8x1xf32> { - // CHECK: %[[REV_FILTER:.*]] = "mhlo.reverse"(%arg0) {dimensions = dense<[0, 1, 2]> : tensor<3xi64>} + // CHECK: %[[REV_FILTER:.*]] = "mhlo.reverse"(%arg0) <{dimensions = dense<[0, 1, 2]> : tensor<3xi64>}> // CHECK: %[[RESULT:.*]] = mhlo.convolution(%arg1, %[[REV_FILTER]]) // CHECK-SAME: dim_numbers = [b, 0, 1, 2, f]x[0, 1, 2, o, i]->[b, 0, 1, 2, f] // CHECK-SAME{LITERAL}: 
window = {stride = [1, 1, 1], pad = [[1, 1], [1, 1], [1, 1]], lhs_dilate = [1, 1, 1], rhs_dilate = [1, 1, 1]} @@ -4184,7 +4184,7 @@ func.func @cross_replica_sum(%input: tensor<10xf32>) -> tensor<10xf32> { // CHECK-LABEL: conv_dynamic func.func @conv_dynamic(%arg0: tensor, %arg1: tensor<3x3x3x16xf32>) -> tensor { // CHECK: "mhlo.dynamic_conv" - // CHECK-SAME: {batch_group_count = 1 : i64, dimension_numbers = #mhlo.conv<[b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f]>, feature_group_count = 2 : i64, precision_config = [#mhlo, #mhlo], rhs_dilation = dense<[2, 3]> : tensor<2xi64>, window_strides = dense<[4, 5]> : tensor<2xi64>} : (tensor, tensor<3x3x3x16xf32>, tensor<4xi32>) -> tensor + // CHECK-SAME: <{batch_group_count = 1 : i64, dimension_numbers = #mhlo.conv<[b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f]>, feature_group_count = 2 : i64, precision_config = [#mhlo, #mhlo], rhs_dilation = dense<[2, 3]> : tensor<2xi64>, window_strides = dense<[4, 5]> : tensor<2xi64>}> : (tensor, tensor<3x3x3x16xf32>, tensor<4xi32>) -> tensor %0 = "tf.Conv2D"(%arg0, %arg1) {data_format = "NHWC", dilations = [1, 2, 3, 1], padding = "SAME", strides = [1, 4, 5, 1]} : (tensor, tensor<3x3x3x16xf32>) -> tensor func.return %0 : tensor } @@ -4245,8 +4245,8 @@ func.func @split_not_match_static_split_dim_size(%input: tensor<4x?x4xf32>) -> ( // CHECK-LABEL: @split_match_and_split_into_two func.func @split_match_and_split_into_two(%input: tensor<4x6xf32>) -> (tensor<2x6xf32>, tensor<2x6xf32>) { %cst = "tf.Const"() {value = dense<0> : tensor} : () -> tensor - // CHECK: %[[ONE:.*]] = "mhlo.slice"(%{{.*}}) {limit_indices = dense<[2, 6]> : tensor<2xi64>, start_indices = dense<0> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<4x6xf32>) -> tensor<2x6xf32> - // CHECK: %[[TWO:.*]] = "mhlo.slice"(%{{.*}}) {limit_indices = dense<[4, 6]> : tensor<2xi64>, start_indices = dense<[2, 0]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<4x6xf32>) -> tensor<2x6xf32> + // CHECK: %[[ONE:.*]] = 
"mhlo.slice"(%{{.*}}) <{limit_indices = dense<[2, 6]> : tensor<2xi64>, start_indices = dense<0> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>}> : (tensor<4x6xf32>) -> tensor<2x6xf32> + // CHECK: %[[TWO:.*]] = "mhlo.slice"(%{{.*}}) <{limit_indices = dense<[4, 6]> : tensor<2xi64>, start_indices = dense<[2, 0]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>}> : (tensor<4x6xf32>) -> tensor<2x6xf32> %0:2 = "tf.Split"(%cst, %input) : (tensor, tensor<4x6xf32>) -> (tensor<2x6xf32>, tensor<2x6xf32>) // CHECK: return %[[ONE]], %[[TWO]] func.return %0#0, %0#1 : tensor<2x6xf32>, tensor<2x6xf32> @@ -4258,9 +4258,9 @@ func.func @split_match_and_split_into_two(%input: tensor<4x6xf32>) -> (tensor<2x // CHECK-SAME: (%[[ARG:.*]]: tensor<4x6xf32>) func.func @split_match_and_split_into_three(%input: tensor<4x6xf32>) -> (tensor<4x2xf32>, tensor<4x2xf32>, tensor<4x2xf32>) { %cst = "tf.Const"() {value = dense<1> : tensor} : () -> tensor - // CHECK: %[[ONE:.*]] = "mhlo.slice"(%[[ARG]]) {limit_indices = dense<[4, 2]> : tensor<2xi64>, start_indices = dense<0> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<4x6xf32>) -> tensor<4x2xf32> - // CHECK: %[[TWO:.*]] = "mhlo.slice"(%[[ARG]]) {limit_indices = dense<4> : tensor<2xi64>, start_indices = dense<[0, 2]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<4x6xf32>) -> tensor<4x2xf32> - // CHECK: %[[THREE:.*]] = "mhlo.slice"(%[[ARG]]) {limit_indices = dense<[4, 6]> : tensor<2xi64>, start_indices = dense<[0, 4]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<4x6xf32>) -> tensor<4x2xf32> + // CHECK: %[[ONE:.*]] = "mhlo.slice"(%[[ARG]]) <{limit_indices = dense<[4, 2]> : tensor<2xi64>, start_indices = dense<0> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>}> : (tensor<4x6xf32>) -> tensor<4x2xf32> + // CHECK: %[[TWO:.*]] = "mhlo.slice"(%[[ARG]]) <{limit_indices = dense<4> : tensor<2xi64>, start_indices = dense<[0, 2]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>}> : 
(tensor<4x6xf32>) -> tensor<4x2xf32> + // CHECK: %[[THREE:.*]] = "mhlo.slice"(%[[ARG]]) <{limit_indices = dense<[4, 6]> : tensor<2xi64>, start_indices = dense<[0, 4]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>}> : (tensor<4x6xf32>) -> tensor<4x2xf32> %0:3 = "tf.Split"(%cst, %input) : (tensor, tensor<4x6xf32>) -> (tensor<4x2xf32>, tensor<4x2xf32>, tensor<4x2xf32>) // CHECK: return %[[ONE]], %[[TWO]], %[[THREE]] func.return %0#0, %0#1, %0#2 : tensor<4x2xf32>, tensor<4x2xf32>, tensor<4x2xf32> @@ -4312,9 +4312,9 @@ func.func @topk_v2(%input: tensor<16x16xf32>) -> (tensor<16x8xf32>, tensor<16x8x func.func @splitv_match_and_split_into_three(%input: tensor<4x6xf32>) -> (tensor<4x1xf32>, tensor<4x2xf32>, tensor<4x3xf32>) { %split_sizes = "tf.Const"() {value = dense<[1, 2, 3]> : tensor<3xi32>} : () -> tensor<3xi32> %split_dim = "tf.Const"() {value = dense<1> : tensor} : () -> tensor - // CHECK: %[[ONE:.*]] = "mhlo.slice"(%[[ARG]]) {limit_indices = dense<[4, 1]> : tensor<2xi64>, start_indices = dense<0> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<4x6xf32>) -> tensor<4x1xf32> - // CHECK: %[[TWO:.*]] = "mhlo.slice"(%[[ARG]]) {limit_indices = dense<[4, 3]> : tensor<2xi64>, start_indices = dense<[0, 1]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<4x6xf32>) -> tensor<4x2xf32> - // CHECK: %[[THREE:.*]] = "mhlo.slice"(%[[ARG]]) {limit_indices = dense<[4, 6]> : tensor<2xi64>, start_indices = dense<[0, 3]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<4x6xf32>) -> tensor<4x3xf32> + // CHECK: %[[ONE:.*]] = "mhlo.slice"(%[[ARG]]) <{limit_indices = dense<[4, 1]> : tensor<2xi64>, start_indices = dense<0> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>}> : (tensor<4x6xf32>) -> tensor<4x1xf32> + // CHECK: %[[TWO:.*]] = "mhlo.slice"(%[[ARG]]) <{limit_indices = dense<[4, 3]> : tensor<2xi64>, start_indices = dense<[0, 1]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>}> : (tensor<4x6xf32>) -> tensor<4x2xf32> + // CHECK: 
%[[THREE:.*]] = "mhlo.slice"(%[[ARG]]) <{limit_indices = dense<[4, 6]> : tensor<2xi64>, start_indices = dense<[0, 3]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>}> : (tensor<4x6xf32>) -> tensor<4x3xf32> %0:3 = "tf.SplitV"(%input, %split_sizes, %split_dim) : (tensor<4x6xf32>, tensor<3xi32>, tensor) -> (tensor<4x1xf32>, tensor<4x2xf32>, tensor<4x3xf32>) // CHECK: return %[[ONE]], %[[TWO]], %[[THREE]] func.return %0#0, %0#1, %0#2 : tensor<4x1xf32>, tensor<4x2xf32>, tensor<4x3xf32> @@ -4365,11 +4365,11 @@ func.func @assert(%arg0: tensor, %arg1: tensor<*xf32>) { // CHECK-LABEL: @unpack func.func @unpack(%input: tensor<4x3x6xf32>) -> (tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>) { - // CHECK: %[[SLICE1:.*]] = "mhlo.slice"(%{{.*}}) {limit_indices = dense<[4, 1, 6]> : tensor<3xi64>, start_indices = dense<0> : tensor<3xi64>, strides = dense<1> : tensor<3xi64>} : (tensor<4x3x6xf32>) -> tensor<4x1x6xf32> + // CHECK: %[[SLICE1:.*]] = "mhlo.slice"(%{{.*}}) <{limit_indices = dense<[4, 1, 6]> : tensor<3xi64>, start_indices = dense<0> : tensor<3xi64>, strides = dense<1> : tensor<3xi64>}> : (tensor<4x3x6xf32>) -> tensor<4x1x6xf32> // CHECK: %[[RES1:.*]] = mhlo.reshape %[[SLICE1]] : (tensor<4x1x6xf32>) -> tensor<4x6xf32> - // CHECK: %[[SLICE2:.*]] = "mhlo.slice"(%{{.*}}) {limit_indices = dense<[4, 2, 6]> : tensor<3xi64>, start_indices = dense<[0, 1, 0]> : tensor<3xi64>, strides = dense<1> : tensor<3xi64>} : (tensor<4x3x6xf32>) -> tensor<4x1x6xf32> + // CHECK: %[[SLICE2:.*]] = "mhlo.slice"(%{{.*}}) <{limit_indices = dense<[4, 2, 6]> : tensor<3xi64>, start_indices = dense<[0, 1, 0]> : tensor<3xi64>, strides = dense<1> : tensor<3xi64>}> : (tensor<4x3x6xf32>) -> tensor<4x1x6xf32> // CHECK: %[[RES2:.*]] = mhlo.reshape %[[SLICE2]] : (tensor<4x1x6xf32>) -> tensor<4x6xf32> - // CHECK: %[[SLICE3:.*]] = "mhlo.slice"(%{{.*}}) {limit_indices = dense<[4, 3, 6]> : tensor<3xi64>, start_indices = dense<[0, 2, 0]> : tensor<3xi64>, strides = dense<1> : tensor<3xi64>} : 
(tensor<4x3x6xf32>) -> tensor<4x1x6xf32> + // CHECK: %[[SLICE3:.*]] = "mhlo.slice"(%{{.*}}) <{limit_indices = dense<[4, 3, 6]> : tensor<3xi64>, start_indices = dense<[0, 2, 0]> : tensor<3xi64>, strides = dense<1> : tensor<3xi64>}> : (tensor<4x3x6xf32>) -> tensor<4x1x6xf32> // CHECK: %[[RES3:.*]] = mhlo.reshape %[[SLICE3]] : (tensor<4x1x6xf32>) -> tensor<4x6xf32> %0:3 = "tf.Unpack"(%input) {axis = 1} : (tensor<4x3x6xf32>) -> (tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>) @@ -4405,19 +4405,19 @@ func.func @unpack_dynamic(%arg0: tensor) -> (tensor, tensor< func.func @unsorted_segment_sum(%data: tensor<8x16x64xf32>, %segment_ids : tensor<8x16xi32>) -> (tensor<4x64xf32>) { %num_segments = "tf.Const"() {value = dense<4> : tensor} : () -> tensor // CHECK: [[ZERO:%.*]] = mhlo.constant dense<0.000000e+00> : tensor - // CHECK: [[INIT:%.*]] = "mhlo.broadcast"([[ZERO]]) {broadcast_sizes = dense<[4, 64]> : tensor<2xi64>} : (tensor) -> tensor<4x64xf32> - // CHECK: [[SCATTER:%.*]] = "mhlo.scatter"([[INIT]], [[SI]], [[DATA]]) ({ - // CHECK: ^{{.*}}([[LHS:%.*]]: tensor, [[RHS:%.*]]: tensor): - // CHECK: [[ADD:%.*]] = mhlo.add [[LHS]], [[RHS]] : tensor - // CHECK: mhlo.return [[ADD]] - // CHECK: indices_are_sorted = false, + // CHECK: [[INIT:%.*]] = "mhlo.broadcast"([[ZERO]]) <{broadcast_sizes = dense<[4, 64]> : tensor<2xi64>}> : (tensor) -> tensor<4x64xf32> + // CHECK: [[SCATTER:%.*]] = "mhlo.scatter"([[INIT]], [[SI]], [[DATA]]) + // CHECK-SAME: indices_are_sorted = false, // CHECK-SAME: scatter_dimension_numbers = // CHECK-SAME: update_window_dims = [2] // CHECK-SAME: inserted_window_dims = [0] // CHECK-SAME: scatter_dims_to_operand_dims = [0] // CHECK-SAME: index_vector_dim = 2 // CHECK-SAME: unique_indices = false - // CHECK-SAME: (tensor<4x64xf32>, tensor<8x16xi32>, tensor<8x16x64xf32>) -> tensor<4x64xf32> + // CHECK: ^{{.*}}([[LHS:%.*]]: tensor, [[RHS:%.*]]: tensor): + // CHECK: [[ADD:%.*]] = mhlo.add [[LHS]], [[RHS]] : tensor + // CHECK: mhlo.return [[ADD]] + // 
CHECK-NEXT: (tensor<4x64xf32>, tensor<8x16xi32>, tensor<8x16x64xf32>) -> tensor<4x64xf32> // CHECK: return [[SCATTER]] %0 = "tf.UnsortedSegmentSum"(%data, %segment_ids, %num_segments) : (tensor<8x16x64xf32>, tensor<8x16xi32>, tensor) -> (tensor<4x64xf32>) func.return %0: tensor<4x64xf32> @@ -4431,19 +4431,19 @@ func.func @unsorted_segment_sum(%data: tensor<8x16x64xf32>, %segment_ids : tenso func.func @unsorted_segment_prod(%data: tensor<8x?x64xf32>, %segment_ids : tensor) -> (tensor<4x?xf32>) { %num_segments = "tf.Const"() {value = dense<4> : tensor} : () -> tensor // CHECK: [[ONE:%.*]] = mhlo.constant dense<1.000000e+00> : tensor - // CHECK: [[INIT:%.*]] = "mhlo.broadcast"([[ONE]]) {broadcast_sizes = dense<[4, 64]> : tensor<2xi64>} : (tensor) -> tensor<4x64xf32> - // CHECK: [[SCATTER:%.*]] = "mhlo.scatter"([[INIT]], [[SI]], [[DATA]]) ({ - // CHECK: ^{{.*}}([[LHS:%.*]]: tensor, [[RHS:%.*]]: tensor): - // CHECK: [[MUL:%.*]] = mhlo.multiply [[LHS]], [[RHS]] : tensor - // CHECK: mhlo.return [[MUL]] - // CHECK: indices_are_sorted = false + // CHECK: [[INIT:%.*]] = "mhlo.broadcast"([[ONE]]) <{broadcast_sizes = dense<[4, 64]> : tensor<2xi64>}> : (tensor) -> tensor<4x64xf32> + // CHECK: [[SCATTER:%.*]] = "mhlo.scatter"([[INIT]], [[SI]], [[DATA]]) + // CHECK-SAME: indices_are_sorted = false // CHECK-SAME: scatter_dimension_numbers = // CHECK-SAME: update_window_dims = [2] // CHECK-SAME: inserted_window_dims = [0] // CHECK-SAME: scatter_dims_to_operand_dims = [0] // CHECK-SAME: index_vector_dim = 2 // CHECK-SAME: unique_indices = false - // CHECK-SAME: (tensor<4x64xf32>, tensor, tensor<8x?x64xf32>) -> tensor<4x?xf32> + // CHECK: ^{{.*}}([[LHS:%.*]]: tensor, [[RHS:%.*]]: tensor): + // CHECK: [[MUL:%.*]] = mhlo.multiply [[LHS]], [[RHS]] : tensor + // CHECK: mhlo.return [[MUL]] + // CHECK-NEXT: (tensor<4x64xf32>, tensor, tensor<8x?x64xf32>) -> tensor<4x?xf32> // CHECK: return [[SCATTER]] %0 = "tf.UnsortedSegmentProd"(%data, %segment_ids, %num_segments) : (tensor<8x?x64xf32>, 
tensor, tensor) -> (tensor<4x?xf32>) func.return %0: tensor<4x?xf32> @@ -4496,7 +4496,7 @@ func.func @gatherNd_dynamic(%arg0: tensor, %arg1: tensor) // CHECK-LABEL: func @gatherNd_static func.func @gatherNd_static(%arg0: tensor<2x4x128xf32>, %arg1: tensor<2x1xi32>) -> tensor<2x4x128xf32> { - // CHECK: "mhlo.gather"({{.*}}) { + // CHECK: "mhlo.gather"({{.*}}) <{ // CHECK-SAME: dimension_numbers = // CHECK-SAME: offset_dims = [1, 2] // CHECK-SAME: collapsed_slice_dims = [0] @@ -4610,9 +4610,9 @@ func.func @strided_slice_grad(%grad: tensor<4x16x1022xf32>) -> tensor<4x128x1024 %strides = "tf.Const"() {value = dense<[1, 4, -1]> : tensor<3xi32>} : () -> (tensor<3xi32>) // CHECK: [[RESHAPE:%.*]] = mhlo.reshape %arg0 : (tensor<4x16x1022xf32>) -> tensor<4x16x1022xf32> - // CHECK: [[REVERSE:%.*]] = "mhlo.reverse"([[RESHAPE]]) {dimensions = dense<2> : tensor<1xi64>} : (tensor<4x16x1022xf32>) -> tensor<4x16x1022xf32> + // CHECK: [[REVERSE:%.*]] = "mhlo.reverse"([[RESHAPE]]) <{dimensions = dense<2> : tensor<1xi64>}> : (tensor<4x16x1022xf32>) -> tensor<4x16x1022xf32> // CHECK: [[ZERO:%.*]] = mhlo.constant dense<0.000000e+00> : tensor - // CHECK: [[PAD:%.*]] = "mhlo.pad"([[REVERSE]], [[ZERO]]) {edge_padding_high = dense<[0, 63, 2]> : tensor<3xi64>, edge_padding_low = dense<[0, 4, 0]> : tensor<3xi64>, interior_padding = dense<[0, 3, 0]> : tensor<3xi64>} : (tensor<4x16x1022xf32>, tensor) -> tensor<4x128x1024xf32> + // CHECK: [[PAD:%.*]] = "mhlo.pad"([[REVERSE]], [[ZERO]]) <{edge_padding_high = dense<[0, 63, 2]> : tensor<3xi64>, edge_padding_low = dense<[0, 4, 0]> : tensor<3xi64>, interior_padding = dense<[0, 3, 0]> : tensor<3xi64>}> : (tensor<4x16x1022xf32>, tensor) -> tensor<4x128x1024xf32> %0 = "tf.StridedSliceGrad"(%shape, %begin, %end, %strides, %grad) {begin_mask = 1, end_mask = 4} : (tensor<3xi32>, tensor<3xi32>, tensor<3xi32>, tensor<3xi32>, tensor<4x16x1022xf32>) -> tensor<4x128x1024xf32> // CHECK: return [[PAD]] @@ -4744,10 +4744,7 @@ func.func 
@strided_slice_grad_all_masks(%grad: tensor<1x4x8x8x10x2x1xf32>) -> te // CHECK-LABEL: @tensor_scatter_update func.func @tensor_scatter_update(%tensor: tensor, %indices: tensor, %updates: tensor) -> tensor { - // CHECK: "mhlo.scatter"(%arg0, %arg1, %arg2) ({ - // CHECK: ^bb0(%arg3: tensor, %arg4: tensor): - // CHECK: mhlo.return %arg4 : tensor - // CHECK: }) + // CHECK: "mhlo.scatter"(%arg0, %arg1, %arg2) // CHECK-SAME: indices_are_sorted = false // CHECK-SAME: scatter_dimension_numbers // CHECK-SAME: update_window_dims = [1] @@ -4755,6 +4752,9 @@ func.func @tensor_scatter_update(%tensor: tensor, %indices: tensor, %arg4: tensor): + // CHECK: mhlo.return %arg4 : tensor + // CHECK: }) %0 = "tf.TensorScatterUpdate"(%tensor, %indices, %updates) : (tensor, tensor, tensor) -> tensor func.return %0 : tensor } @@ -4764,7 +4764,7 @@ func.func @tensor_scatter_update(%tensor: tensor, %indices: tensor, %indices: tensor<2x1xi32>, %updates: tensor) -> tensor<4x3xi32> { // CHECK: mhlo.constant dense<[2, 3]> : tensor<2xi64> - // CHECK: "mhlo.dynamic_broadcast_in_dim"(%arg2, %0) {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor, tensor<2xi64>) -> tensor<2x3xi32> + // CHECK: "mhlo.dynamic_broadcast_in_dim"(%arg2, %0) <{broadcast_dimensions = dense<> : tensor<0xi64>}> : (tensor, tensor<2xi64>) -> tensor<2x3xi32> // CHECK: "mhlo.scatter" %0 = "tf.TensorScatterUpdate"(%tensor, %indices, %updates) : (tensor<4x3xi32>, tensor<2x1xi32>, tensor) -> tensor<4x3xi32> func.return %0 : tensor<4x3xi32> @@ -4774,11 +4774,7 @@ func.func @tensor_scatter_update_scalar_update(%tensor: tensor<4x3xi32>, %indice // CHECK-LABEL: @tensor_scatter_add func.func @tensor_scatter_add(%tensor: tensor, %indices: tensor, %updates: tensor) -> tensor { - // CHECK: "mhlo.scatter"(%arg0, %arg1, %arg2) ({ - // CHECK: ^bb0(%arg3: tensor, %arg4: tensor): - // CHECK: %1 = mhlo.add %arg3, %arg4 : tensor - // CHECK: mhlo.return %1 : tensor - // CHECK: }) + // CHECK: "mhlo.scatter"(%arg0, %arg1, %arg2) // 
CHECK-SAME: indices_are_sorted = false // CHECK-SAME: scatter_dimension_numbers // CHECK-SAME: update_window_dims = [1] @@ -4786,6 +4782,10 @@ func.func @tensor_scatter_add(%tensor: tensor, %indices: tensor, %arg4: tensor): + // CHECK: %1 = mhlo.add %arg3, %arg4 : tensor + // CHECK: mhlo.return %1 : tensor + // CHECK: }) %0 = "tf.TensorScatterAdd"(%tensor, %indices, %updates) : (tensor, tensor, tensor) -> tensor func.return %0 : tensor } @@ -4795,8 +4795,8 @@ func.func @tensor_scatter_add(%tensor: tensor, %indices: tensor, %indices: tensor<2x1xi32>, %updates: tensor) -> tensor<4x3xi32> { // CHECK: mhlo.constant dense<[2, 3]> : tensor<2xi64> - // CHECK: "mhlo.dynamic_broadcast_in_dim"(%arg2, %0) {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor, tensor<2xi64>) -> tensor<2x3xi32> - // CHECK: "mhlo.scatter" + // CHECK: "mhlo.dynamic_broadcast_in_dim"(%arg2, %0) <{broadcast_dimensions = dense<> : tensor<0xi64>}> : (tensor, tensor<2xi64>) -> tensor<2x3xi32> + // CHECK: "mhlo.scatter %0 = "tf.TensorScatterAdd"(%tensor, %indices, %updates) : (tensor<4x3xi32>, tensor<2x1xi32>, tensor) -> tensor<4x3xi32> func.return %0 : tensor<4x3xi32> } @@ -4805,11 +4805,7 @@ func.func @tensor_scatter_add_scalar_update(%tensor: tensor<4x3xi32>, %indices: // CHECK-LABEL: @tensor_scatter_sub func.func @tensor_scatter_sub(%tensor: tensor, %indices: tensor, %updates: tensor) -> tensor { - // CHECK: "mhlo.scatter"(%arg0, %arg1, %arg2) ({ - // CHECK: ^bb0(%arg3: tensor, %arg4: tensor): - // CHECK: %1 = mhlo.subtract %arg3, %arg4 : tensor - // CHECK: mhlo.return %1 : tensor - // CHECK: }) + // CHECK: "mhlo.scatter"(%arg0, %arg1, %arg2) // CHECK-SAME: indices_are_sorted = false // CHECK-SAME: scatter_dimension_numbers // CHECK-SAME: update_window_dims = [1] @@ -4817,6 +4813,10 @@ func.func @tensor_scatter_sub(%tensor: tensor, %indices: tensor, %arg4: tensor): + // CHECK: %1 = mhlo.subtract %arg3, %arg4 : tensor + // CHECK: mhlo.return %1 : tensor + // CHECK: }) %0 = 
"tf.TensorScatterSub"(%tensor, %indices, %updates) : (tensor, tensor, tensor) -> tensor func.return %0 : tensor } @@ -4825,11 +4825,7 @@ func.func @tensor_scatter_sub(%tensor: tensor, %indices: tensor, %indices: tensor, %updates: tensor) -> tensor { - // CHECK: "mhlo.scatter"(%arg0, %arg1, %arg2) ({ - // CHECK: ^bb0(%arg3: tensor, %arg4: tensor): - // CHECK: %1 = mhlo.minimum %arg3, %arg4 : tensor - // CHECK: mhlo.return %1 : tensor - // CHECK: }) + // CHECK: "mhlo.scatter"(%arg0, %arg1, %arg2) // CHECK-SAME: indices_are_sorted = false // CHECK-SAME: scatter_dimension_numbers // CHECK-SAME: update_window_dims = [1] @@ -4837,6 +4833,10 @@ func.func @tensor_scatter_min(%tensor: tensor, %indices: tensor, %arg4: tensor): + // CHECK: %1 = mhlo.minimum %arg3, %arg4 : tensor + // CHECK: mhlo.return %1 : tensor + // CHECK: }) %0 = "tf.TensorScatterMin"(%tensor, %indices, %updates) : (tensor, tensor, tensor) -> tensor func.return %0 : tensor } @@ -4845,11 +4845,7 @@ func.func @tensor_scatter_min(%tensor: tensor, %indices: tensor, %indices: tensor, %updates: tensor) -> tensor { - // CHECK: "mhlo.scatter"(%arg0, %arg1, %arg2) ({ - // CHECK: ^bb0(%arg3: tensor, %arg4: tensor): - // CHECK: %1 = mhlo.maximum %arg3, %arg4 : tensor - // CHECK: mhlo.return %1 : tensor - // CHECK: }) + // CHECK: "mhlo.scatter"(%arg0, %arg1, %arg2) // CHECK-SAME: indices_are_sorted = false // CHECK-SAME: scatter_dimension_numbers // CHECK-SAME: update_window_dims = [1] @@ -4857,6 +4853,10 @@ func.func @tensor_scatter_max(%tensor: tensor, %indices: tensor, %arg4: tensor): + // CHECK: %1 = mhlo.maximum %arg3, %arg4 : tensor + // CHECK: mhlo.return %1 : tensor + // CHECK: }) %0 = "tf.TensorScatterMax"(%tensor, %indices, %updates) : (tensor, tensor, tensor) -> tensor func.return %0 : tensor } @@ -4894,11 +4894,11 @@ func.func @random_shuffle_1D_16(%input: tensor<16xf32>) -> tensor<16xf32> { // CHECK-DAG: [[SHAPE:%.*]] = mhlo.constant dense<16> : tensor<1xi64> // CHECK-DAG: [[LOWER:%.*]] = mhlo.constant 
dense<0> : tensor // CHECK-DAG: [[UPPER:%.*]] = mhlo.constant dense<-1> : tensor - // CHECK: [[RNG:%.*]] = "mhlo.rng"([[LOWER]], [[UPPER]], [[SHAPE]]) {rng_distribution = #mhlo.rng_distribution} - // CHECK: [[SORT:%.*]]:2 = "mhlo.sort"([[RNG]], [[INPUT]]) ({ + // CHECK: [[RNG:%.*]] = "mhlo.rng"([[LOWER]], [[UPPER]], [[SHAPE]]) <{rng_distribution = #mhlo.rng_distribution}> + // CHECK: [[SORT:%.*]]:2 = "mhlo.sort"([[RNG]], [[INPUT]]) <{dimension = -1 : i64, is_stable = {{.*}}}> ({ // CHECK: ^{{.*}}([[ARG1:%.*]]: tensor, [[ARG2:%.*]]: tensor, {{.*}}: tensor, {{.*}}: tensor): // CHECK: mhlo.compare LT, [[ARG1]], [[ARG2]], TOTALORDER - // CHECK: }) {dimension = -1 : i64, is_stable = {{.*}}} : (tensor<16xi32>, tensor<16xf32>) -> (tensor<16xi32>, tensor<16xf32>) + // CHECK: }) : (tensor<16xi32>, tensor<16xf32>) -> (tensor<16xi32>, tensor<16xf32>) // CHECK: return [[SORT]]#1 %0 = "tf.RandomShuffle"(%input) : (tensor<16xf32>) -> (tensor<16xf32>) func.return %0: tensor<16xf32> @@ -4921,12 +4921,12 @@ func.func @random_shuffle_1D_10240(%input: tensor<10240xf32>) -> tensor<10240xf3 // CHECK-LABEL: @random_shuffle_3D // CHECK-SAME: [[INPUT:%.*]]: tensor<4x?x16xf32> func.func @random_shuffle_3D(%input: tensor<4x?x16xf32>) -> tensor<4x?x16xf32> { - // CHECK: [[INDICES:%.*]] = "mhlo.iota"() {iota_dimension = 0 : i64} : () -> tensor<4xi32> + // CHECK: [[INDICES:%.*]] = "mhlo.iota"() <{iota_dimension = 0 : i64}> : () -> tensor<4xi32> // CHECK-DAG: [[RNG_SHAPE:%.*]] = mhlo.constant dense<4> : tensor<1xi64> // CHECK-DAG: [[RNG_LOWER:%.*]] = mhlo.constant dense<0> : tensor // CHECK-DAG: [[RNG_UPPER:%.*]] = mhlo.constant dense<4> : tensor - // CHECK: [[SWAPS:%.*]] = "mhlo.rng"([[RNG_LOWER]], [[RNG_UPPER]], [[RNG_SHAPE]]) {rng_distribution = #mhlo.rng_distribution} + // CHECK: [[SWAPS:%.*]] = "mhlo.rng"([[RNG_LOWER]], [[RNG_UPPER]], [[RNG_SHAPE]]) <{rng_distribution = #mhlo.rng_distribution}> // CHECK: [[IV_INIT:%.*]] = mhlo.constant dense<0> : tensor @@ -4935,10 +4935,10 @@ func.func 
@random_shuffle_3D(%input: tensor<4x?x16xf32>) -> tensor<4x?x16xf32> { // CHECK: [[CMP:%.*]] = mhlo.compare LT, [[ITER_ARG0]], [[LIMIT]], NOTYPE // CHECK: mhlo.return [[CMP]] // CHECK: } do { - // CHECK: [[SRC_IDX:%.*]] = "mhlo.dynamic_slice"([[ITER_ARG2]], [[ITER_ARG0]]) {slice_sizes = dense<1> : tensor<1xi64>} : (tensor<4xi32>, tensor) -> tensor<1xi32> - // CHECK: [[SWP_IDX:%.*]] = "mhlo.dynamic_slice"([[ITER_ARG1]], [[ITER_ARG0]]) {slice_sizes = dense<1> : tensor<1xi64>} : (tensor<4xi32>, tensor) -> tensor<1xi32> + // CHECK: [[SRC_IDX:%.*]] = "mhlo.dynamic_slice"([[ITER_ARG2]], [[ITER_ARG0]]) <{slice_sizes = dense<1> : tensor<1xi64>}> : (tensor<4xi32>, tensor) -> tensor<1xi32> + // CHECK: [[SWP_IDX:%.*]] = "mhlo.dynamic_slice"([[ITER_ARG1]], [[ITER_ARG0]]) <{slice_sizes = dense<1> : tensor<1xi64>}> : (tensor<4xi32>, tensor) -> tensor<1xi32> // CHECK: [[SWP:%.*]] = mhlo.reshape [[SWP_IDX]] : (tensor<1xi32>) -> tensor - // CHECK: [[TGT_IDX:%.*]] = "mhlo.dynamic_slice"([[ITER_ARG2]], [[SWP]]) {slice_sizes = dense<1> : tensor<1xi64>} + // CHECK: [[TGT_IDX:%.*]] = "mhlo.dynamic_slice"([[ITER_ARG2]], [[SWP]]) <{slice_sizes = dense<1> : tensor<1xi64>}> // CHECK: [[INDICES1:%.*]] = mhlo.dynamic_update_slice [[ITER_ARG2]], [[TGT_IDX]], [[ITER_ARG0]] : (tensor<4xi32>, tensor<1xi32>, tensor) -> tensor<4xi32> // CHECK: [[INDICES2:%.*]] = mhlo.dynamic_update_slice [[INDICES1]], [[SRC_IDX]], [[SWP]] : (tensor<4xi32>, tensor<1xi32>, tensor) -> tensor<4xi32> // CHECK: [[ONE:%.*]] = mhlo.constant dense<1> : tensor @@ -4952,7 +4952,7 @@ func.func @random_shuffle_3D(%input: tensor<4x?x16xf32>) -> tensor<4x?x16xf32> { // CHECK: [[INDEX_CAST:%.*]] = arith.index_cast [[SHAPE_DIM]] : index to i64 // CHECK: [[FROM_ELEMENTS:%.*]] = tensor.from_elements [[INDEX_CAST]] : tensor<1xi64> // CHECK: [[CONSTANT2:%.*]] = mhlo.constant dense<16> : tensor<1xi64> - // CHECK: [[CONCATENATE:%.*]] = "mhlo.concatenate"([[CONSTANT1]], [[FROM_ELEMENTS]], [[CONSTANT2]]) {dimension = 0 : i64} : 
(tensor<1xi64>, tensor<1xi64>, tensor<1xi64>) -> tensor<3xi64> + // CHECK: [[CONCATENATE:%.*]] = "mhlo.concatenate"([[CONSTANT1]], [[FROM_ELEMENTS]], [[CONSTANT2]]) <{dimension = 0 : i64}> : (tensor<1xi64>, tensor<1xi64>, tensor<1xi64>) -> tensor<3xi64> // CHECK: [[DYNAMIC_GATHER:%.*]] = "mhlo.dynamic_gather"([[INPUT]], [[WHILE_OUT]]#2, [[CONCATENATE]]) // CHECK-SAME: dimension_numbers = // CHECK-SAME: offset_dims = [1, 2] @@ -4978,13 +4978,13 @@ func.func @random_shuffle_3D(%input: tensor<4x?x16xf32>) -> tensor<4x?x16xf32> { // CHECK-SAME: [[ARG:%.+]]: tensor<2x12x21x7xf16> // CHECK: [[CONV32:%.+]] = mhlo.convert %arg0 : (tensor<2x12x21x7xf16>) -> tensor<2x12x21x7xf32> // CHECK: [[ZERO:%.+]] = mhlo.constant dense<0.000000e+00> : tensor -// CHECK: [[DIVIDEND:%.+]] = "mhlo.reduce_window"([[CONV32]], [[ZERO]]) ({ +// CHECK: [[DIVIDEND:%.+]] = "mhlo.reduce_window"([[CONV32]], [[ZERO]]) +// CHECK-SAME: window_dimensions = dense<[1, 2, 2, 1]> +// CHECK-SAME: window_strides = dense<[1, 4, 4, 1]> // CHECK: ^bb0([[ARG1:%.+]]: tensor, [[ARG2:%.+]]: tensor): // CHECK: [[ADD:%.+]] = mhlo.add [[ARG1]], [[ARG2]] // CHECK: mhlo.return [[ADD]] // CHECK: }) -// CHECK-SAME: window_dimensions = dense<[1, 2, 2, 1]> -// CHECK-SAME: window_strides = dense<[1, 4, 4, 1]> // CHECK-SAME: -> tensor<2x3x5x7xf32> // CHECK: [[COUNT:%.+]] = mhlo.constant dense<4.000000e+00> : tensor // CHECK: [[DIV_RESULT:%.+]] = chlo.broadcast_divide [[DIVIDEND]], [[COUNT]] @@ -5004,13 +5004,13 @@ func.func @avgpool_valid_padding(%arg0: tensor<2x12x21x7xf16>) -> tensor<2x3x5x7 // CHECK-SAME: [[ARG:%.+]]: tensor<2x4x12x21x7xf16> // CHECK: [[CONV32:%.+]] = mhlo.convert %arg0 : (tensor<2x4x12x21x7xf16>) -> tensor<2x4x12x21x7xf32> // CHECK: [[ZERO:%.+]] = mhlo.constant dense<0.000000e+00> : tensor -// CHECK: [[DIVIDEND:%.+]] = "mhlo.reduce_window"([[CONV32]], [[ZERO]]) ({ +// CHECK: [[DIVIDEND:%.+]] = "mhlo.reduce_window"([[CONV32]], [[ZERO]]) +// CHECK-SAME: window_dimensions = dense<[1, 1, 2, 2, 1]> +// 
CHECK-SAME: window_strides = dense<[1, 1, 4, 4, 1]> // CHECK: ^bb0([[ARG1:%.+]]: tensor, [[ARG2:%.+]]: tensor): // CHECK: [[ADD:%.+]] = mhlo.add [[ARG1]], [[ARG2]] // CHECK: mhlo.return [[ADD]] // CHECK: }) -// CHECK-SAME: window_dimensions = dense<[1, 1, 2, 2, 1]> -// CHECK-SAME: window_strides = dense<[1, 1, 4, 4, 1]> // CHECK-SAME: -> tensor<2x4x3x5x7xf32> // CHECK: [[COUNT:%.+]] = mhlo.constant dense<4.000000e+00> : tensor // CHECK: [[DIV_RESULT:%.+]] = chlo.broadcast_divide [[DIVIDEND]], [[COUNT]] @@ -5030,13 +5030,13 @@ func.func @avgpool_3d_valid_padding(%arg0: tensor<2x4x12x21x7xf16>) -> tensor<2x // CHECK-SAME: [[ARG:%.+]]: tensor<2x7x12x21xf16> // CHECK: [[CONV32:%.+]] = mhlo.convert %arg0 : (tensor<2x7x12x21xf16>) -> tensor<2x7x12x21xf32> // CHECK: [[ZERO:%.+]] = mhlo.constant dense<0.000000e+00> : tensor -// CHECK: [[DIVIDEND:%.+]] = "mhlo.reduce_window"([[CONV32]], [[ZERO]]) ({ +// CHECK: [[DIVIDEND:%.+]] = "mhlo.reduce_window"([[CONV32]], [[ZERO]]) +// CHECK-SAME: window_dimensions = dense<[1, 1, 2, 2]> +// CHECK-SAME: window_strides = dense<[1, 1, 4, 4]> // CHECK: ^bb0([[ARG1:%.+]]: tensor, [[ARG2:%.+]]: tensor): // CHECK: [[ADD:%.+]] = mhlo.add [[ARG1]], [[ARG2]] // CHECK: mhlo.return [[ADD]] // CHECK: }) -// CHECK-SAME: window_dimensions = dense<[1, 1, 2, 2]> -// CHECK-SAME: window_strides = dense<[1, 1, 4, 4]> // CHECK-SAME: -> tensor<2x7x3x5xf32> // CHECK: [[COUNT:%.+]] = mhlo.constant dense<4.000000e+00> : tensor // CHECK: [[DIV_RESULT:%.+]] = chlo.broadcast_divide [[DIVIDEND]], [[COUNT]] @@ -5056,13 +5056,13 @@ func.func @avgpool_nchw_format(%arg0: tensor<2x7x12x21xf16>) -> tensor<2x7x3x5xf // CHECK-SAME: [[ARG:%.+]]: tensor<2x7x4x12x21xf16> // CHECK: [[CONV32:%.+]] = mhlo.convert %arg0 : (tensor<2x7x4x12x21xf16>) -> tensor<2x7x4x12x21xf32> // CHECK: [[ZERO:%.+]] = mhlo.constant dense<0.000000e+00> : tensor -// CHECK: [[DIVIDEND:%.+]] = "mhlo.reduce_window"([[CONV32]], [[ZERO]]) ({ +// CHECK: [[DIVIDEND:%.+]] = "mhlo.reduce_window"([[CONV32]], 
[[ZERO]]) +// CHECK-SAME: window_dimensions = dense<[1, 1, 1, 2, 2]> +// CHECK-SAME: window_strides = dense<[1, 1, 1, 4, 4]> // CHECK: ^bb0([[ARG1:%.+]]: tensor, [[ARG2:%.+]]: tensor): // CHECK: [[ADD:%.+]] = mhlo.add [[ARG1]], [[ARG2]] // CHECK: mhlo.return [[ADD]] // CHECK: }) -// CHECK-SAME: window_dimensions = dense<[1, 1, 1, 2, 2]> -// CHECK-SAME: window_strides = dense<[1, 1, 1, 4, 4]> // CHECK-SAME: -> tensor<2x7x4x3x5xf32> // CHECK: [[COUNT:%.+]] = mhlo.constant dense<4.000000e+00> : tensor // CHECK: [[DIV_RESULT:%.+]] = chlo.broadcast_divide [[DIVIDEND]], [[COUNT]] @@ -5081,24 +5081,24 @@ func.func @avgpool_3d_ncdhw_format(%arg0: tensor<2x7x4x12x21xf16>) -> tensor<2x7 // CHECK-LABEL: @avgpool_same_padding( // CHECK-SAME: %[[ARG0:.*]]: tensor<2x12x21x7xf32>) -> tensor<2x4x6x7xf32> // CHECK: %[[ZERO:.*]] = mhlo.constant dense<0.000000e+00> : tensor -// CHECK: %[[DIVIDEND:.*]] = "mhlo.reduce_window"(%[[ARG0]], %[[ZERO]]) ({ +// CHECK: %[[DIVIDEND:.*]] = "mhlo.reduce_window"(%[[ARG0]], %[[ZERO]]) +// CHECK-SAME: padding = dense<{{\[\[}}0, 0], [1, 1], [0, 1], [0, 0]]> +// CHECK-SAME: window_dimensions = dense<[1, 5, 2, 1]> +// CHECK-SAME: window_strides = dense<[1, 3, 4, 1]> // CHECK: ^bb0(%[[ARG1:.*]]: tensor, %[[ARG2:.*]]: tensor): // CHECK: %[[SUM1:.*]] = mhlo.add %[[ARG1]], %[[ARG2]] : tensor // CHECK: mhlo.return %[[SUM1]] : tensor // CHECK: }) +// CHECK-SAME: -> tensor<2x4x6x7xf32> +// CHECK: %[[ONES:.*]] = mhlo.constant dense<1.000000e+00> : tensor<2x12x21x7xf32> +// CHECK: %[[DIVISOR:.*]] = "mhlo.reduce_window"(%[[ONES]], %[[ZERO]]) // CHECK-SAME: padding = dense<{{\[\[}}0, 0], [1, 1], [0, 1], [0, 0]]> // CHECK-SAME: window_dimensions = dense<[1, 5, 2, 1]> // CHECK-SAME: window_strides = dense<[1, 3, 4, 1]> -// CHECK-SAME: -> tensor<2x4x6x7xf32> -// CHECK: %[[ONES:.*]] = mhlo.constant dense<1.000000e+00> : tensor<2x12x21x7xf32> -// CHECK: %[[DIVISOR:.*]] = "mhlo.reduce_window"(%[[ONES]], %[[ZERO]]) ({ // CHECK: ^bb0(%[[ARG3:.*]]: tensor, %[[ARG4:.*]]: 
tensor): // CHECK: %[[SUM2:.*]] = mhlo.add %[[ARG3]], %[[ARG4]] : tensor // CHECK: mhlo.return %[[SUM2]] : tensor // CHECK: }) -// CHECK-SAME: padding = dense<{{\[\[}}0, 0], [1, 1], [0, 1], [0, 0]]> -// CHECK-SAME: window_dimensions = dense<[1, 5, 2, 1]> -// CHECK-SAME: window_strides = dense<[1, 3, 4, 1]> // CHECK-SAME: -> tensor<2x4x6x7xf32> // CHECK: %[[RESULT:.*]] = mhlo.divide %[[DIVIDEND]], %[[DIVISOR]] : tensor<2x4x6x7xf32> // CHECK: return %[[RESULT]] : tensor<2x4x6x7xf32> @@ -5113,24 +5113,24 @@ func.func @avgpool_same_padding(%arg0: tensor<2x12x21x7xf32>) -> tensor<2x4x6x7x // CHECK-LABEL: @avgpool_3d_same_padding( // CHECK-SAME: %[[ARG0:.*]]: tensor<2x4x12x21x7xf32>) -> tensor<2x4x4x6x7xf32> // CHECK: %[[ZERO:.*]] = mhlo.constant dense<0.000000e+00> : tensor -// CHECK: %[[DIVIDEND:.*]] = "mhlo.reduce_window"(%[[ARG0]], %[[ZERO]]) ({ +// CHECK: %[[DIVIDEND:.*]] = "mhlo.reduce_window"(%[[ARG0]], %[[ZERO]]) +// CHECK-SAME: padding = dense<{{\[\[}}0, 0], [0, 0], [1, 1], [0, 1], [0, 0]]> +// CHECK-SAME: window_dimensions = dense<[1, 1, 5, 2, 1]> +// CHECK-SAME: window_strides = dense<[1, 1, 3, 4, 1]> // CHECK: ^bb0(%[[ARG1:.*]]: tensor, %[[ARG2:.*]]: tensor): // CHECK: %[[SUM1:.*]] = mhlo.add %[[ARG1]], %[[ARG2]] : tensor // CHECK: mhlo.return %[[SUM1]] : tensor // CHECK: }) +// CHECK-SAME: -> tensor<2x4x4x6x7xf32> +// CHECK: %[[ONES:.*]] = mhlo.constant dense<1.000000e+00> : tensor<2x4x12x21x7xf32> +// CHECK: %[[DIVISOR:.*]] = "mhlo.reduce_window"(%[[ONES]], %[[ZERO]]) // CHECK-SAME: padding = dense<{{\[\[}}0, 0], [0, 0], [1, 1], [0, 1], [0, 0]]> // CHECK-SAME: window_dimensions = dense<[1, 1, 5, 2, 1]> // CHECK-SAME: window_strides = dense<[1, 1, 3, 4, 1]> -// CHECK-SAME: -> tensor<2x4x4x6x7xf32> -// CHECK: %[[ONES:.*]] = mhlo.constant dense<1.000000e+00> : tensor<2x4x12x21x7xf32> -// CHECK: %[[DIVISOR:.*]] = "mhlo.reduce_window"(%[[ONES]], %[[ZERO]]) ({ // CHECK: ^bb0(%[[ARG3:.*]]: tensor, %[[ARG4:.*]]: tensor): // CHECK: %[[SUM2:.*]] = mhlo.add %[[ARG3]], 
%[[ARG4]] : tensor // CHECK: mhlo.return %[[SUM2]] : tensor // CHECK: }) -// CHECK-SAME: padding = dense<{{\[\[}}0, 0], [0, 0], [1, 1], [0, 1], [0, 0]]> -// CHECK-SAME: window_dimensions = dense<[1, 1, 5, 2, 1]> -// CHECK-SAME: window_strides = dense<[1, 1, 3, 4, 1]> // CHECK-SAME: -> tensor<2x4x4x6x7xf32> // CHECK: %[[RESULT:.*]] = mhlo.divide %[[DIVIDEND]], %[[DIVISOR]] // CHECK: return %[[RESULT]] : tensor<2x4x4x6x7xf32> @@ -5158,13 +5158,13 @@ func.func @avgpool_3d_same_padding(%arg0: tensor<2x4x12x21x7xf32>) -> tensor<2x4 // CHECK-SAME: edge_padding_low = dense<[0, 1, 1, 0]> // CHECK-SAME: interior_padding = dense<[0, 1, 1, 0]> // CHECK-SAME: -> tensor<10x25x33x64xf32> -// CHECK: %[[RESULT:.*]] = "mhlo.reduce_window"(%[[REDUCE_WINDOW_INPUT]], %[[ZERO]]) ({ +// CHECK: %[[RESULT:.*]] = "mhlo.reduce_window"(%[[REDUCE_WINDOW_INPUT]], %[[ZERO]]) +// CHECK-SAME: window_dimensions = dense<[1, 2, 2, 1]> +// CHECK-SAME: window_strides = dense<1> // CHECK: ^bb0(%[[ARG1:.*]]: tensor, %[[ARG2:.*]]: tensor): // CHECK: %[[SUM:.*]] = mhlo.add %[[ARG1]], %[[ARG2]] : tensor // CHECK: mhlo.return %[[SUM]] : tensor // CHECK: }) -// CHECK-SAME: window_dimensions = dense<[1, 2, 2, 1]> -// CHECK-SAME: window_strides = dense<1> // CHECK-SAME: -> tensor<10x24x32x64xf32> // CHECK: return %[[RESULT]] : tensor<10x24x32x64xf32> func.func @avgpool_grad_valid_padding(%grad: tensor<10x12x16x64xf32>) -> tensor<10x24x32x64xf32> { @@ -5190,13 +5190,13 @@ func.func @avgpool_grad_valid_padding(%grad: tensor<10x12x16x64xf32>) -> tensor< // CHECK-SAME: edge_padding_low = dense<[0, 0, 1, 1, 0]> // CHECK-SAME: interior_padding = dense<[0, 0, 1, 1, 0]> // CHECK-SAME: -> tensor<10x8x25x33x64xf32> -// CHECK: %[[RESULT:.*]] = "mhlo.reduce_window"(%[[REDUCE_WINDOW_INPUT]], %[[ZERO]]) ({ +// CHECK: %[[RESULT:.*]] = "mhlo.reduce_window"(%[[REDUCE_WINDOW_INPUT]], %[[ZERO]]) +// CHECK-SAME: window_dimensions = dense<[1, 1, 2, 2, 1]> +// CHECK-SAME: window_strides = dense<1> // CHECK: ^bb0(%[[ARG1:.*]]: 
tensor, %[[ARG2:.*]]: tensor): // CHECK: %[[SUM:.*]] = mhlo.add %[[ARG1]], %[[ARG2]] : tensor // CHECK: mhlo.return %[[SUM]] : tensor // CHECK: }) -// CHECK-SAME: window_dimensions = dense<[1, 1, 2, 2, 1]> -// CHECK-SAME: window_strides = dense<1> // CHECK-SAME: -> tensor<10x8x24x32x64xf32> // CHECK: return %[[RESULT]] : tensor<10x8x24x32x64xf32> func.func @avgpool_3d_grad_valid_padding(%grad: tensor<10x8x12x16x64xf32>) -> tensor<10x8x24x32x64xf32> { @@ -5215,14 +5215,14 @@ func.func @avgpool_3d_grad_valid_padding(%grad: tensor<10x8x12x16x64xf32>) -> te // CHECK-SAME: %[[OUT_GRAD:.*]]: tensor<2x4x7x9xf32>) -> tensor<2x13x25x9xf32> { // CHECK-DAG: %[[ZERO:.*]] = mhlo.constant dense<0.000000e+00> : tensor // CHECK-DAG: %[[ALL_ONES:.*]] = mhlo.constant dense<1.000000e+00> : tensor<2x13x25x9xf32> -// CHECK: %[[DIVISOR:.*]] = "mhlo.reduce_window"(%[[ALL_ONES]], %[[ZERO]]) ({ +// CHECK: %[[DIVISOR:.*]] = "mhlo.reduce_window"(%[[ALL_ONES]], %[[ZERO]]) +// CHECK-SAME: padding = dense<{{\[\[}}0, 0], [0, 1], [1, 1], [0, 0]]> +// CHECK-SAME: window_dimensions = dense<[1, 2, 3, 1]> +// CHECK-SAME: window_strides = dense<[1, 4, 4, 1]> // CHECK: ^bb0(%[[ARG1:.*]]: tensor, %[[ARG2:.*]]: tensor): // CHECK: %[[SUM1:.*]] = mhlo.add %[[ARG1]], %[[ARG2]] : tensor // CHECK: mhlo.return %[[SUM1]] : tensor // CHECK: }) -// CHECK-SAME: padding = dense<{{\[\[}}0, 0], [0, 1], [1, 1], [0, 0]]> -// CHECK-SAME: window_dimensions = dense<[1, 2, 3, 1]> -// CHECK-SAME: window_strides = dense<[1, 4, 4, 1]> // CHECK-SAME: -> tensor<2x4x7x9xf32> // CHECK: %[[OUT_GRAD_DIVIDED:.*]] = mhlo.divide %[[OUT_GRAD]], %[[DIVISOR]] : tensor<2x4x7x9xf32> // CHECK: %[[REDUCE_WINDOW_INPUT:.*]] = "mhlo.pad"(%[[OUT_GRAD_DIVIDED]], %[[ZERO]]) @@ -5230,13 +5230,13 @@ func.func @avgpool_3d_grad_valid_padding(%grad: tensor<10x8x12x16x64xf32>) -> te // CHECK-SAME: edge_padding_low = dense<[0, 1, 1, 0]> // CHECK-SAME: interior_padding = dense<[0, 3, 3, 0]> // CHECK-SAME: -> tensor<2x14x27x9xf32> -// CHECK: %[[RESULT:.*]] 
= "mhlo.reduce_window"(%[[REDUCE_WINDOW_INPUT]], %[[ZERO]]) ({ +// CHECK: %[[RESULT:.*]] = "mhlo.reduce_window"(%[[REDUCE_WINDOW_INPUT]], %[[ZERO]]) +// CHECK-SAME: window_dimensions = dense<[1, 2, 3, 1]> +// CHECK-SAME: window_strides = dense<1> // CHECK: ^bb0(%[[ARG3:.*]]: tensor, %[[ARG4:.*]]: tensor): // CHECK: %[[SUM2:.*]] = mhlo.add %[[ARG3]], %[[ARG4]] : tensor // CHECK: mhlo.return %[[SUM2]] : tensor // CHECK: }) -// CHECK-SAME: window_dimensions = dense<[1, 2, 3, 1]> -// CHECK-SAME: window_strides = dense<1> // CHECK-SAME: -> tensor<2x13x25x9xf32> // CHECK: return %[[RESULT]] : tensor<2x13x25x9xf32> func.func @avgpool_grad_same_padding(%grad: tensor<2x4x7x9xf32>) -> tensor<2x13x25x9xf32> { @@ -5256,14 +5256,14 @@ func.func @avgpool_grad_same_padding(%grad: tensor<2x4x7x9xf32>) -> tensor<2x13x // CHECK-SAME: %[[OUT_GRAD:.*]]: tensor<2x8x4x7x9xf32>) -> tensor<2x8x13x25x9xf32> { // CHECK-DAG: %[[ZERO:.*]] = mhlo.constant dense<0.000000e+00> : tensor // CHECK-DAG: %[[ALL_ONES:.*]] = mhlo.constant dense<1.000000e+00> : tensor<2x8x13x25x9xf32> -// CHECK: %[[DIVISOR:.*]] = "mhlo.reduce_window"(%[[ALL_ONES]], %[[ZERO]]) ({ +// CHECK: %[[DIVISOR:.*]] = "mhlo.reduce_window"(%[[ALL_ONES]], %[[ZERO]]) +// CHECK-SAME: padding = dense<{{\[\[}}0, 0], [0, 0], [0, 1], [1, 1], [0, 0]]> +// CHECK-SAME: window_dimensions = dense<[1, 1, 2, 3, 1]> +// CHECK-SAME: window_strides = dense<[1, 1, 4, 4, 1]> // CHECK: ^bb0(%[[ARG1:.*]]: tensor, %[[ARG2:.*]]: tensor): // CHECK: %[[SUM1:.*]] = mhlo.add %[[ARG1]], %[[ARG2]] : tensor // CHECK: mhlo.return %[[SUM1]] : tensor // CHECK: }) -// CHECK-SAME: padding = dense<{{\[\[}}0, 0], [0, 0], [0, 1], [1, 1], [0, 0]]> -// CHECK-SAME: window_dimensions = dense<[1, 1, 2, 3, 1]> -// CHECK-SAME: window_strides = dense<[1, 1, 4, 4, 1]> // CHECK-SAME: -> tensor<2x8x4x7x9xf32> // CHECK: %[[OUT_GRAD_DIVIDED:.*]] = mhlo.divide %[[OUT_GRAD]], %[[DIVISOR]] : tensor<2x8x4x7x9xf32> // CHECK: %[[REDUCE_WINDOW_INPUT:.*]] = 
"mhlo.pad"(%[[OUT_GRAD_DIVIDED]], %[[ZERO]]) @@ -5271,13 +5271,13 @@ func.func @avgpool_grad_same_padding(%grad: tensor<2x4x7x9xf32>) -> tensor<2x13x // CHECK-SAME: edge_padding_low = dense<[0, 0, 1, 1, 0]> // CHECK-SAME: interior_padding = dense<[0, 0, 3, 3, 0]> // CHECK-SAME: -> tensor<2x8x14x27x9xf32> -// CHECK: %[[RESULT:.*]] = "mhlo.reduce_window"(%[[REDUCE_WINDOW_INPUT]], %[[ZERO]]) ({ +// CHECK: %[[RESULT:.*]] = "mhlo.reduce_window"(%[[REDUCE_WINDOW_INPUT]], %[[ZERO]]) +// CHECK-SAME: window_dimensions = dense<[1, 1, 2, 3, 1]> +// CHECK-SAME: window_strides = dense<1> // CHECK: ^bb0(%[[ARG3:.*]]: tensor, %[[ARG4:.*]]: tensor): // CHECK: %[[SUM2:.*]] = mhlo.add %[[ARG3]], %[[ARG4]] : tensor // CHECK: mhlo.return %[[SUM2]] : tensor // CHECK: }) -// CHECK-SAME: window_dimensions = dense<[1, 1, 2, 3, 1]> -// CHECK-SAME: window_strides = dense<1> // CHECK-SAME: -> tensor<2x8x13x25x9xf32> // CHECK: return %[[RESULT]] : tensor<2x8x13x25x9xf32> func.func @avgpool_3d_grad_same_padding(%grad: tensor<2x8x4x7x9xf32>) -> tensor<2x8x13x25x9xf32> { @@ -5296,14 +5296,14 @@ func.func @avgpool_3d_grad_same_padding(%grad: tensor<2x8x4x7x9xf32>) -> tensor< // CHECK-SAME: %[[OUT_GRAD:.*]]: tensor<2x9x4x7xf32>) -> tensor<2x9x13x25xf32> { // CHECK-DAG: %[[ZERO:.*]] = mhlo.constant dense<0.000000e+00> : tensor // CHECK-DAG: %[[ALL_ONES:.*]] = mhlo.constant dense<1.000000e+00> : tensor<2x9x13x25xf32> -// CHECK: %[[DIVISOR:.*]] = "mhlo.reduce_window"(%[[ALL_ONES]], %[[ZERO]]) ({ +// CHECK: %[[DIVISOR:.*]] = "mhlo.reduce_window"(%[[ALL_ONES]], %[[ZERO]]) +// CHECK-SAME: padding = dense<{{\[\[}}0, 0], [0, 0], [0, 1], [1, 1]]> +// CHECK-SAME: window_dimensions = dense<[1, 1, 2, 3]> +// CHECK-SAME: window_strides = dense<[1, 1, 4, 4]> // CHECK: ^bb0(%[[ARG1:.*]]: tensor, %[[ARG2:.*]]: tensor): // CHECK: %[[SUM1:.*]] = mhlo.add %[[ARG1]], %[[ARG2]] : tensor // CHECK: mhlo.return %[[SUM1]] : tensor // CHECK: }) -// CHECK-SAME: padding = dense<{{\[\[}}0, 0], [0, 0], [0, 1], [1, 1]]> -// 
CHECK-SAME: window_dimensions = dense<[1, 1, 2, 3]> -// CHECK-SAME: window_strides = dense<[1, 1, 4, 4]> // CHECK-SAME: -> tensor<2x9x4x7xf32> // CHECK: %[[OUT_GRAD_DIVIDED:.*]] = mhlo.divide %[[OUT_GRAD]], %[[DIVISOR]] : tensor<2x9x4x7xf32> // CHECK: %[[REDUCE_WINDOW_INPUT:.*]] = "mhlo.pad"(%[[OUT_GRAD_DIVIDED]], %[[ZERO]]) @@ -5311,13 +5311,13 @@ func.func @avgpool_3d_grad_same_padding(%grad: tensor<2x8x4x7x9xf32>) -> tensor< // CHECK-SAME: edge_padding_low = dense<[0, 0, 1, 1]> // CHECK-SAME: interior_padding = dense<[0, 0, 3, 3]> // CHECK-SAME: -> tensor<2x9x14x27xf32> -// CHECK: %[[RESULT:.*]] = "mhlo.reduce_window"(%[[REDUCE_WINDOW_INPUT]], %[[ZERO]]) ({ +// CHECK: %[[RESULT:.*]] = "mhlo.reduce_window"(%[[REDUCE_WINDOW_INPUT]], %[[ZERO]]) +// CHECK-SAME: window_dimensions = dense<[1, 1, 2, 3]> +// CHECK-SAME: window_strides = dense<1> // CHECK: ^bb0(%[[ARG3:.*]]: tensor, %[[ARG4:.*]]: tensor): // CHECK: %[[SUM2:.*]] = mhlo.add %[[ARG3]], %[[ARG4]] : tensor // CHECK: mhlo.return %[[SUM2]] : tensor // CHECK: }) -// CHECK-SAME: window_dimensions = dense<[1, 1, 2, 3]> -// CHECK-SAME: window_strides = dense<1> // CHECK-SAME: -> tensor<2x9x13x25xf32> // CHECK: return %[[RESULT]] : tensor<2x9x13x25xf32> func.func @avgpool_grad_nchw_format(%grad: tensor<2x9x4x7xf32>) -> tensor<2x9x13x25xf32> { @@ -5337,14 +5337,14 @@ func.func @avgpool_grad_nchw_format(%grad: tensor<2x9x4x7xf32>) -> tensor<2x9x13 // CHECK-SAME: %[[OUT_GRAD:.*]]: tensor<2x9x8x4x7xf32>) -> tensor<2x9x8x13x25xf32> { // CHECK-DAG: %[[ZERO:.*]] = mhlo.constant dense<0.000000e+00> : tensor // CHECK-DAG: %[[ALL_ONES:.*]] = mhlo.constant dense<1.000000e+00> : tensor<2x9x8x13x25xf32> -// CHECK: %[[DIVISOR:.*]] = "mhlo.reduce_window"(%[[ALL_ONES]], %[[ZERO]]) ({ +// CHECK: %[[DIVISOR:.*]] = "mhlo.reduce_window"(%[[ALL_ONES]], %[[ZERO]]) +// CHECK-SAME: padding = dense<{{\[\[}}0, 0], [0, 0], [0, 0], [0, 1], [1, 1]]> +// CHECK-SAME: window_dimensions = dense<[1, 1, 1, 2, 3]> +// CHECK-SAME: window_strides = 
dense<[1, 1, 1, 4, 4]> // CHECK: ^bb0(%[[ARG1:.*]]: tensor, %[[ARG2:.*]]: tensor): // CHECK: %[[SUM1:.*]] = mhlo.add %[[ARG1]], %[[ARG2]] : tensor // CHECK: mhlo.return %[[SUM1]] : tensor // CHECK: }) -// CHECK-SAME: padding = dense<{{\[\[}}0, 0], [0, 0], [0, 0], [0, 1], [1, 1]]> -// CHECK-SAME: window_dimensions = dense<[1, 1, 1, 2, 3]> -// CHECK-SAME: window_strides = dense<[1, 1, 1, 4, 4]> // CHECK-SAME: -> tensor<2x9x8x4x7xf32> // CHECK: %[[OUT_GRAD_DIVIDED:.*]] = mhlo.divide %[[OUT_GRAD]], %[[DIVISOR]] : tensor<2x9x8x4x7xf32> // CHECK: %[[REDUCE_WINDOW_INPUT:.*]] = "mhlo.pad"(%[[OUT_GRAD_DIVIDED]], %[[ZERO]]) @@ -5352,13 +5352,13 @@ func.func @avgpool_grad_nchw_format(%grad: tensor<2x9x4x7xf32>) -> tensor<2x9x13 // CHECK-SAME: edge_padding_low = dense<[0, 0, 0, 1, 1]> // CHECK-SAME: interior_padding = dense<[0, 0, 0, 3, 3]> // CHECK-SAME: -> tensor<2x9x8x14x27xf32> -// CHECK: %[[RESULT:.*]] = "mhlo.reduce_window"(%[[REDUCE_WINDOW_INPUT]], %[[ZERO]]) ({ +// CHECK: %[[RESULT:.*]] = "mhlo.reduce_window"(%[[REDUCE_WINDOW_INPUT]], %[[ZERO]]) +// CHECK-SAME: window_dimensions = dense<[1, 1, 1, 2, 3]> +// CHECK-SAME: window_strides = dense<1> : tensor<5xi64> // CHECK: ^bb0(%[[ARG3:.*]]: tensor, %[[ARG4:.*]]: tensor): // CHECK: %[[SUM2:.*]] = mhlo.add %[[ARG3]], %[[ARG4]] : tensor // CHECK: mhlo.return %[[SUM2]] : tensor // CHECK: }) -// CHECK-SAME: window_dimensions = dense<[1, 1, 1, 2, 3]> -// CHECK-SAME: window_strides = dense<1> : tensor<5xi64> // CHECK-SAME: -> tensor<2x9x8x13x25xf32> // CHECK: return %[[RESULT]] : tensor<2x9x8x13x25xf32> func.func @avgpool_3d_grad_ncdwh_format(%grad: tensor<2x9x8x4x7xf32>) -> tensor<2x9x8x13x25xf32> { @@ -5387,13 +5387,13 @@ func.func @avgpool_3d_grad_ncdwh_format(%grad: tensor<2x9x8x4x7xf32>) -> tensor< // CHECK-SAME: -> tensor<10x25x33x64xbf16> // CHECK: %[[REDUCE_WINDOW_INPUT_CONVERTED:.*]] = mhlo.convert %[[REDUCE_WINDOW_INPUT]] : (tensor<10x25x33x64xbf16>) -> tensor<10x25x33x64xf32> // CHECK: %[[ZERO_F32:.*]] = 
mhlo.constant dense<0.000000e+00> : tensor -// CHECK: %[[RESULT:.*]] = "mhlo.reduce_window"(%[[REDUCE_WINDOW_INPUT_CONVERTED]], %[[ZERO_F32]]) ({ +// CHECK: %[[RESULT:.*]] = "mhlo.reduce_window"(%[[REDUCE_WINDOW_INPUT_CONVERTED]], %[[ZERO_F32]]) +// CHECK-SAME: window_dimensions = dense<[1, 2, 2, 1]> +// CHECK-SAME: window_strides = dense<1> // CHECK: ^bb0(%[[ARG1:.*]]: tensor, %[[ARG2:.*]]: tensor): // CHECK: %[[SUM:.*]] = mhlo.add %[[ARG1]], %[[ARG2]] : tensor // CHECK: mhlo.return %[[SUM]] : tensor // CHECK: }) -// CHECK-SAME: window_dimensions = dense<[1, 2, 2, 1]> -// CHECK-SAME: window_strides = dense<1> // CHECK-SAME: -> tensor<10x24x32x64xf32> // CHECK: %[[RESULT_CONVERTED:.*]] = mhlo.convert %[[RESULT]] : (tensor<10x24x32x64xf32>) -> tensor<10x24x32x64xbf16> // CHECK: return %[[RESULT_CONVERTED]] : tensor<10x24x32x64xbf16> @@ -5422,8 +5422,8 @@ func.func @xla_sharding(%arg0: tensor<4x16xf32>) -> tensor<4x16xf32> { // CHECK-LABEL: inplace_update_one func.func @inplace_update_one(%arg0: tensor<8x4xf32>, %arg1: tensor<1x4xf32>, %arg2: tensor<1xi32>) -> tensor<8x4xf32> { // CHECK-DAG: [[CST:%.+]] = mhlo.constant dense<0> - // CHECK-DAG: [[SLICE1:%.+]] = "mhlo.slice"(%arg2) {limit_indices = dense<1> : tensor<1xi64>, start_indices = dense<0> : tensor<1xi64>, strides = dense<1> : tensor<1xi64>} - // CHECK-DAG: [[SLICE2:%.+]] = "mhlo.slice"(%arg1) {limit_indices = dense<[1, 4]> : tensor<2xi64>, start_indices = dense<0> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} + // CHECK-DAG: [[SLICE1:%.+]] = "mhlo.slice"(%arg2) <{limit_indices = dense<1> : tensor<1xi64>, start_indices = dense<0> : tensor<1xi64>, strides = dense<1> : tensor<1xi64>}> + // CHECK-DAG: [[SLICE2:%.+]] = "mhlo.slice"(%arg1) <{limit_indices = dense<[1, 4]> : tensor<2xi64>, start_indices = dense<0> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>}> // CHECK-DAG: [[RESHAPE1:%.+]] = mhlo.reshape [[SLICE1]] // CHECK-DAG: [[UPDATE:%.+]] = mhlo.dynamic_update_slice %arg0, [[SLICE2]], 
[[RESHAPE1]], [[CST]] %0 = "tf.InplaceUpdate"(%arg0, %arg2, %arg1) : (tensor<8x4xf32>, tensor<1xi32>, tensor<1x4xf32>) -> tensor<8x4xf32> @@ -5437,12 +5437,12 @@ func.func @inplace_update_one(%arg0: tensor<8x4xf32>, %arg1: tensor<1x4xf32>, %a // CHECK-LABEL: inplace_update_three func.func @inplace_update_three(%arg0: tensor<8x8x4xf32>, %arg1: tensor<3x8x4xf32>, %arg2: tensor<3xi32>) -> tensor<8x8x4xf32> { // CHECK-DAG: [[CST:%.+]] = mhlo.constant dense<0> - // CHECK-DAG: [[SLICE1:%.+]] = "mhlo.slice"(%arg2) {limit_indices = dense<1> : tensor<1xi64>, start_indices = dense<0> : tensor<1xi64>, strides = dense<1> : tensor<1xi64>} - // CHECK-DAG: [[SLICE2:%.+]] = "mhlo.slice"(%arg2) {limit_indices = dense<2> : tensor<1xi64>, start_indices = dense<1> : tensor<1xi64>, strides = dense<1> : tensor<1xi64>} - // CHECK-DAG: [[SLICE3:%.+]] = "mhlo.slice"(%arg2) {limit_indices = dense<3> : tensor<1xi64>, start_indices = dense<2> : tensor<1xi64>, strides = dense<1> : tensor<1xi64>} - // CHECK-DAG: [[SLICE4:%.+]] = "mhlo.slice"(%arg1) {limit_indices = dense<[1, 8, 4]> : tensor<3xi64>, start_indices = dense<0> : tensor<3xi64>, strides = dense<1> : tensor<3xi64>} - // CHECK-DAG: [[SLICE5:%.+]] = "mhlo.slice"(%arg1) {limit_indices = dense<[2, 8, 4]> : tensor<3xi64>, start_indices = dense<[1, 0, 0]> : tensor<3xi64>, strides = dense<1> : tensor<3xi64>} - // CHECK-DAG: [[SLICE6:%.+]] = "mhlo.slice"(%arg1) {limit_indices = dense<[3, 8, 4]> : tensor<3xi64>, start_indices = dense<[2, 0, 0]> : tensor<3xi64>, strides = dense<1> : tensor<3xi64>} + // CHECK-DAG: [[SLICE1:%.+]] = "mhlo.slice"(%arg2) <{limit_indices = dense<1> : tensor<1xi64>, start_indices = dense<0> : tensor<1xi64>, strides = dense<1> : tensor<1xi64>}> + // CHECK-DAG: [[SLICE2:%.+]] = "mhlo.slice"(%arg2) <{limit_indices = dense<2> : tensor<1xi64>, start_indices = dense<1> : tensor<1xi64>, strides = dense<1> : tensor<1xi64>}> + // CHECK-DAG: [[SLICE3:%.+]] = "mhlo.slice"(%arg2) <{limit_indices = dense<3> : tensor<1xi64>, 
start_indices = dense<2> : tensor<1xi64>, strides = dense<1> : tensor<1xi64>}> + // CHECK-DAG: [[SLICE4:%.+]] = "mhlo.slice"(%arg1) <{limit_indices = dense<[1, 8, 4]> : tensor<3xi64>, start_indices = dense<0> : tensor<3xi64>, strides = dense<1> : tensor<3xi64>}> + // CHECK-DAG: [[SLICE5:%.+]] = "mhlo.slice"(%arg1) <{limit_indices = dense<[2, 8, 4]> : tensor<3xi64>, start_indices = dense<[1, 0, 0]> : tensor<3xi64>, strides = dense<1> : tensor<3xi64>}> + // CHECK-DAG: [[SLICE6:%.+]] = "mhlo.slice"(%arg1) <{limit_indices = dense<[3, 8, 4]> : tensor<3xi64>, start_indices = dense<[2, 0, 0]> : tensor<3xi64>, strides = dense<1> : tensor<3xi64>}> // CHECK-DAG: [[RESHAPE1:%.+]] = mhlo.reshape [[SLICE1]] // CHECK-DAG: [[RESHAPE2:%.+]] = mhlo.reshape [[SLICE2]] // CHECK-DAG: [[RESHAPE3:%.+]] = mhlo.reshape [[SLICE3]] @@ -5459,9 +5459,9 @@ func.func @inplace_update_three(%arg0: tensor<8x8x4xf32>, %arg1: tensor<3x8x4xf3 // CHECK-LABEL: xla_dynamic_update_slice func.func @xla_dynamic_update_slice(%arg0: tensor<4x16xf32>, %arg1: tensor<2x4xf32>, %arg2: tensor<2xi32>) -> tensor<4x16xf32> { - // CHECK: [[SLICE0:%.+]] = "mhlo.slice"(%arg2) {limit_indices = dense<1> : tensor<1xi64>, start_indices = dense<0> : tensor<1xi64>, strides = dense<1> : tensor<1xi64>} : (tensor<2xi32>) -> tensor<1xi32> + // CHECK: [[SLICE0:%.+]] = "mhlo.slice"(%arg2) <{limit_indices = dense<1> : tensor<1xi64>, start_indices = dense<0> : tensor<1xi64>, strides = dense<1> : tensor<1xi64>}> : (tensor<2xi32>) -> tensor<1xi32> // CHECK: [[RESHAPE0:%.+]] = mhlo.reshape [[SLICE0]] : (tensor<1xi32>) -> tensor - // CHECK: [[SLICE1:%.+]] = "mhlo.slice"(%arg2) {limit_indices = dense<2> : tensor<1xi64>, start_indices = dense<1> : tensor<1xi64>, strides = dense<1> : tensor<1xi64>} : (tensor<2xi32>) -> tensor<1xi32> + // CHECK: [[SLICE1:%.+]] = "mhlo.slice"(%arg2) <{limit_indices = dense<2> : tensor<1xi64>, start_indices = dense<1> : tensor<1xi64>, strides = dense<1> : tensor<1xi64>}> : (tensor<2xi32>) -> tensor<1xi32> // 
CHECK: [[RESHAPE1:%.+]] = mhlo.reshape [[SLICE1]] : (tensor<1xi32>) -> tensor // CHECK: [[DUS:%.+]] = mhlo.dynamic_update_slice %arg0, %arg1, [[RESHAPE0]], [[RESHAPE1]] : (tensor<4x16xf32>, tensor<2x4xf32>, tensor, tensor) -> tensor<4x16xf32> // CHECK: return [[DUS]] @@ -5473,7 +5473,7 @@ func.func @xla_dynamic_update_slice(%arg0: tensor<4x16xf32>, %arg1: tensor<2x4xf // CHECK-LABEL: xla_dynamic_update_slice2 func.func @xla_dynamic_update_slice2(%arg0: tensor<4xf32>, %arg1: tensor<2xf32>, %arg2: tensor<1xi32>) -> tensor<4xf32> { - // CHECK: [[SLICE0:%.+]] = "mhlo.slice"(%arg2) {limit_indices = dense<1> : tensor<1xi64>, start_indices = dense<0> : tensor<1xi64>, strides = dense<1> : tensor<1xi64>} : (tensor<1xi32>) -> tensor<1xi32> + // CHECK: [[SLICE0:%.+]] = "mhlo.slice"(%arg2) <{limit_indices = dense<1> : tensor<1xi64>, start_indices = dense<0> : tensor<1xi64>, strides = dense<1> : tensor<1xi64>}> : (tensor<1xi32>) -> tensor<1xi32> // CHECK: [[RESHAPE0:%.+]] = mhlo.reshape [[SLICE0]] : (tensor<1xi32>) -> tensor // CHECK: [[DUS:%.+]] = mhlo.dynamic_update_slice %arg0, %arg1, [[RESHAPE0]] : (tensor<4xf32>, tensor<2xf32>, tensor) -> tensor<4xf32> // CHECK: return [[DUS]] @@ -5512,11 +5512,11 @@ func.func @cumsum_static(%arg0: tensor<4xf32>) -> tensor<4xf32> { // CHECK: [[AXIS:%.*]] = mhlo.constant dense<0> : tensor // CHECK: [[CONVERT_X:%.*]] = mhlo.convert [[X]] : tensor<4xf32> // CHECK: [[INIT:%.*]] = mhlo.constant dense<0.000000e+00> : tensor - // CHECK: [[REDUCE:%.*]] = "mhlo.reduce_window"([[CONVERT_X]], [[INIT]]) ({ + // CHECK: [[REDUCE:%.*]] = "mhlo.reduce_window"([[CONVERT_X]], [[INIT]]) <{padding = dense<{{\[\[}}3, 0]]> : tensor<1x2xi64>, window_dimensions = dense<4> : tensor<1xi64>, window_strides = dense<1> : tensor<1xi64>}> ({ // CHECK: ^bb0([[A:%.*]]: tensor, [[B:%.*]]: tensor): // CHECK: [[SUM:%.*]] = mhlo.add [[A]], [[B]] : tensor // CHECK: mhlo.return [[SUM]] : tensor - // CHECK: }) {padding = dense<{{\[\[}}3, 0]]> : tensor<1x2xi64>, window_dimensions 
= dense<4> : tensor<1xi64>, window_strides = dense<1> : tensor<1xi64>} : (tensor<4xf32>, tensor) -> tensor<4xf32> + // CHECK: }) : (tensor<4xf32>, tensor) -> tensor<4xf32> // CHECK: [[CONVERT_REDUCE:%.*]] = mhlo.convert [[REDUCE]] : tensor<4xf32> // CHECK: return [[CONVERT_REDUCE]] %0 = "tf.Const"() {_output_shapes = ["tfshape$"], device = "", dtype = i32, value = dense<0> : tensor} : () -> tensor @@ -5532,12 +5532,12 @@ func.func @cumsum_exclusive(%arg0: tensor<4xf32>) -> tensor<4xf32> { // CHECK: [[AXIS:%.*]] = mhlo.constant dense<0> : tensor // CHECK: [[CONVERT_X:%.*]] = mhlo.convert [[X]] : tensor<4xf32> // CHECK: [[INIT:%.*]] = mhlo.constant dense<0.000000e+00> : tensor - // CHECK: [[REDUCE:%.*]] = "mhlo.reduce_window"([[CONVERT_X]], [[INIT]]) ({ + // CHECK: [[REDUCE:%.*]] = "mhlo.reduce_window"([[CONVERT_X]], [[INIT]]) <{padding = dense<{{\[\[}}3, 0]]> : tensor<1x2xi64>, window_dimensions = dense<4> : tensor<1xi64>, window_strides = dense<1> : tensor<1xi64>}> ({ // CHECK: ^bb0([[A:%.*]]: tensor, [[B:%.*]]: tensor): // CHECK: [[SUM:%.*]] = mhlo.add [[A]], [[B]] : tensor // CHECK: mhlo.return [[SUM]] : tensor - // CHECK: }) {padding = dense<{{\[\[}}3, 0]]> : tensor<1x2xi64>, window_dimensions = dense<4> : tensor<1xi64>, window_strides = dense<1> : tensor<1xi64>} : (tensor<4xf32>, tensor) -> tensor<4xf32> - // CHECK: [[PAD:%.*]] = "mhlo.pad"([[REDUCE]], %{{.*}}) {edge_padding_high = dense<-1> : tensor<1xi64>, edge_padding_low = dense<1> : tensor<1xi64>, interior_padding = dense<0> : tensor<1xi64>} : (tensor<4xf32>, tensor) -> tensor<4xf32> + // CHECK: }) : (tensor<4xf32>, tensor) -> tensor<4xf32> + // CHECK: [[PAD:%.*]] = "mhlo.pad"([[REDUCE]], %{{.*}}) <{edge_padding_high = dense<-1> : tensor<1xi64>, edge_padding_low = dense<1> : tensor<1xi64>, interior_padding = dense<0> : tensor<1xi64>}> : (tensor<4xf32>, tensor) -> tensor<4xf32> // CHECK: [[CONVERT_REDUCE:%.*]] = mhlo.convert [[PAD]] : tensor<4xf32> // CHECK: return [[CONVERT_REDUCE]] %0 = "tf.Const"() 
{_output_shapes = ["tfshape$"], device = "", dtype = i32, value = dense<0> : tensor} : () -> tensor @@ -5551,16 +5551,16 @@ func.func @cumsum_exclusive(%arg0: tensor<4xf32>) -> tensor<4xf32> { // CHECK-SAME: [[X:%.*]]: tensor<4xf32> func.func @cumsum_reverse(%arg0: tensor<4xf32>) -> tensor<4xf32> { // CHECK: [[AXIS:%.*]] = mhlo.constant dense<0> : tensor - // CHECK: [[REVERSE1:%.*]] = "mhlo.reverse"([[X]]) {dimensions = dense<0> : tensor<1xi64>} : (tensor<4xf32>) -> tensor<4xf32> + // CHECK: [[REVERSE1:%.*]] = "mhlo.reverse"([[X]]) <{dimensions = dense<0> : tensor<1xi64>}> : (tensor<4xf32>) -> tensor<4xf32> // CHECK: [[CONVERT_X:%.*]] = mhlo.convert [[REVERSE1]] : tensor<4xf32> // CHECK: [[INIT:%.*]] = mhlo.constant dense<0.000000e+00> : tensor - // CHECK: [[REDUCE:%.*]] = "mhlo.reduce_window"([[CONVERT_X]], [[INIT]]) ({ + // CHECK: [[REDUCE:%.*]] = "mhlo.reduce_window"([[CONVERT_X]], [[INIT]]) <{padding = dense<{{\[\[}}3, 0]]> : tensor<1x2xi64>, window_dimensions = dense<4> : tensor<1xi64>, window_strides = dense<1> : tensor<1xi64>}> ({ // CHECK: ^bb0([[A:%.*]]: tensor, [[B:%.*]]: tensor): // CHECK: [[SUM:%.*]] = mhlo.add [[A]], [[B]] : tensor // CHECK: mhlo.return [[SUM]] : tensor - // CHECK: }) {padding = dense<{{\[\[}}3, 0]]> : tensor<1x2xi64>, window_dimensions = dense<4> : tensor<1xi64>, window_strides = dense<1> : tensor<1xi64>} : (tensor<4xf32>, tensor) -> tensor<4xf32> + // CHECK: }) : (tensor<4xf32>, tensor) -> tensor<4xf32> // CHECK: [[CONVERT_REDUCE:%.*]] = mhlo.convert [[REDUCE]] : tensor<4xf32> - // CHECK: [[REVERSE_BACK:%.*]] = "mhlo.reverse"([[CONVERT_REDUCE]]) {dimensions = dense<0> : tensor<1xi64>} : (tensor<4xf32>) -> tensor<4xf32> + // CHECK: [[REVERSE_BACK:%.*]] = "mhlo.reverse"([[CONVERT_REDUCE]]) <{dimensions = dense<0> : tensor<1xi64>}> : (tensor<4xf32>) -> tensor<4xf32> // CHECK: return [[REVERSE_BACK]] %0 = "tf.Const"() {_output_shapes = ["tfshape$"], device = "", dtype = i32, value = dense<0> : tensor} : () -> tensor %1 = 
"tf.Cumsum"(%arg0, %0) {exclusive = false, reverse = true} : (tensor<4xf32>, tensor) -> tensor<4xf32> @@ -5573,17 +5573,17 @@ func.func @cumsum_reverse(%arg0: tensor<4xf32>) -> tensor<4xf32> { // CHECK-SAME: [[X:%.*]]: tensor<4xf32> func.func @cumsum_exclusive_reverse(%arg0: tensor<4xf32>) -> tensor<4xf32> { // CHECK: [[AXIS:%.*]] = mhlo.constant dense<0> : tensor - // CHECK: [[REVERSE1:%.*]] = "mhlo.reverse"([[X]]) {dimensions = dense<0> : tensor<1xi64>} : (tensor<4xf32>) -> tensor<4xf32> + // CHECK: [[REVERSE1:%.*]] = "mhlo.reverse"([[X]]) <{dimensions = dense<0> : tensor<1xi64>}> : (tensor<4xf32>) -> tensor<4xf32> // CHECK: [[CONVERT_X:%.*]] = mhlo.convert [[REVERSE1]] : tensor<4xf32> // CHECK: [[INIT:%.*]] = mhlo.constant dense<0.000000e+00> : tensor - // CHECK: [[REDUCE:%.*]] = "mhlo.reduce_window"([[CONVERT_X]], [[INIT]]) ({ + // CHECK: [[REDUCE:%.*]] = "mhlo.reduce_window"([[CONVERT_X]], [[INIT]]) <{padding = dense<{{\[\[}}3, 0]]> : tensor<1x2xi64>, window_dimensions = dense<4> : tensor<1xi64>, window_strides = dense<1> : tensor<1xi64>}> ({ // CHECK: ^bb0([[A:%.*]]: tensor, [[B:%.*]]: tensor): // CHECK: [[SUM:%.*]] = mhlo.add [[A]], [[B]] : tensor // CHECK: mhlo.return [[SUM]] : tensor - // CHECK: }) {padding = dense<{{\[\[}}3, 0]]> : tensor<1x2xi64>, window_dimensions = dense<4> : tensor<1xi64>, window_strides = dense<1> : tensor<1xi64>} : (tensor<4xf32>, tensor) -> tensor<4xf32> - // CHECK: [[PAD:%.*]] = "mhlo.pad"([[REDUCE]], %{{.*}}) {edge_padding_high = dense<-1> : tensor<1xi64>, edge_padding_low = dense<1> : tensor<1xi64>, interior_padding = dense<0> : tensor<1xi64>} : (tensor<4xf32>, tensor) -> tensor<4xf32> + // CHECK: }) : (tensor<4xf32>, tensor) -> tensor<4xf32> + // CHECK: [[PAD:%.*]] = "mhlo.pad"([[REDUCE]], %{{.*}}) <{edge_padding_high = dense<-1> : tensor<1xi64>, edge_padding_low = dense<1> : tensor<1xi64>, interior_padding = dense<0> : tensor<1xi64>}> : (tensor<4xf32>, tensor) -> tensor<4xf32> // CHECK: [[CONVERT_REDUCE:%.*]] = mhlo.convert 
[[PAD]] : tensor<4xf32> - // CHECK: [[REVERSE_BACK:%.*]] = "mhlo.reverse"([[CONVERT_REDUCE]]) {dimensions = dense<0> : tensor<1xi64>} : (tensor<4xf32>) -> tensor<4xf32> + // CHECK: [[REVERSE_BACK:%.*]] = "mhlo.reverse"([[CONVERT_REDUCE]]) <{dimensions = dense<0> : tensor<1xi64>}> : (tensor<4xf32>) -> tensor<4xf32> // CHECK: return [[REVERSE_BACK]] %0 = "tf.Const"() {_output_shapes = ["tfshape$"], device = "", dtype = i32, value = dense<0> : tensor} : () -> tensor %1 = "tf.Cumsum"(%arg0, %0) {exclusive = true, reverse = true} : (tensor<4xf32>, tensor) -> tensor<4xf32> @@ -5619,7 +5619,7 @@ func.func @cumsum_dynamic(%arg0: tensor, %arg1: tensor) -> tensor) -> tensor<4xf32> { // CHECK: [[INIT:%.*]] = mhlo.constant dense<1.000000e+00> : tensor - // CHECK: "mhlo.reduce_window"({{.*}}, [[INIT]]) ({ + // CHECK: "mhlo.reduce_window"({{.*}}, [[INIT]]) // CHECK: mhlo.mul %0 = "tf.Const"() {_output_shapes = ["tfshape$"], device = "", dtype = i32, value = dense<0> : tensor} : () -> tensor %1 = "tf.Cumprod"(%arg0, %0) {exclusive = false, reverse = false} : (tensor<4xf32>, tensor) -> tensor<4xf32> @@ -5857,7 +5857,7 @@ func.func @xla_conv_v2(%lhs: tensor<8x4x16x16x16xf32>, %rhs: tensor<4x3x3x16x16x // CHECK-LABEL: @xladot_matmul( // CHECK-SAME: %[[LHS:.*]]: tensor<64x32xi8>, %[[RHS:.*]]: tensor<32x16xi8>) -> tensor<64x16xi32> func.func @xladot_matmul(%lhs : tensor<64x32xi8>, %rhs : tensor<32x16xi8>) -> tensor<64x16xi32> { - // CHECK: "mhlo.dot_general"(%[[LHS]], %[[RHS]]) { + // CHECK: "mhlo.dot_general"(%[[LHS]], %[[RHS]]) <{ // CHECK-SAME: dot_dimension_numbers = #mhlo.dot< // CHECK-NOT: lhs_batching_dimensions = // CHECK-NOT: rhs_batching_dimensions = @@ -5877,7 +5877,7 @@ func.func @xladot_matmul(%lhs : tensor<64x32xi8>, %rhs : tensor<32x16xi8>) -> te // CHECK-LABEL: @xladotv2_matmul( // CHECK-SAME: %[[LHS:.*]]: tensor<64x32xi8>, %[[RHS:.*]]: tensor<32x16xi8>) -> tensor<64x16xi32> func.func @xladotv2_matmul(%lhs : tensor<64x32xi8>, %rhs : tensor<32x16xi8>) -> 
tensor<64x16xi32> { - // CHECK: "mhlo.dot_general"(%[[LHS]], %[[RHS]]) { + // CHECK: "mhlo.dot_general"(%[[LHS]], %[[RHS]]) <{ // CHECK-SAME: dot_dimension_numbers = #mhlo.dot< // CHECK-NOT: lhs_batching_dimensions = // CHECK-NOT: rhs_batching_dimensions = @@ -5916,7 +5916,7 @@ func.func @xla_dynamic_slice_constant_start(%arg0: tensor<4xi32>) -> tensor<2xi3 // CHECK-LABEL: xla_dynamic_slice_i32_consts func.func @xla_dynamic_slice_i32_consts(%arg0: tensor<4xi32>) -> tensor<2xi32> { // CHECK: %[[START:.*]] = mhlo.constant dense<1> : tensor - // CHECK: "mhlo.dynamic_slice"(%arg0, %[[START]]) {slice_sizes = dense<2> : tensor<1xi64>} : (tensor<4xi32>, tensor) -> tensor<2xi32> + // CHECK: "mhlo.dynamic_slice"(%arg0, %[[START]]) <{slice_sizes = dense<2> : tensor<1xi64>}> : (tensor<4xi32>, tensor) -> tensor<2xi32> %starts = "tf.Const"() {value = dense<[1]> : tensor<1xi32>} : () -> (tensor<1xi32>) %sizes = "tf.Const"() {value = dense<[2]> : tensor<1xi32>} : () -> (tensor<1xi32>) %0 = "tf.XlaDynamicSlice"(%arg0, %starts, %sizes) : (tensor<4xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<2xi32> @@ -5954,7 +5954,7 @@ func.func @xla_dynamic_slice_variable_start(%arg0: tensor<3x4xi32>, %arg1: tenso // CHECK-DAG-SAME: start_indices = dense<1> : tensor<1xi64>, // CHECK-DAG-SAME: strides = dense<1> : tensor<1xi64>} : (tensor<2xi64>) -> tensor<1xi64> // CHECK: %[[RESHAPED_START2:.*]] = mhlo.reshape %[[SLICED_START2]] : (tensor<1xi64>) -> tensor - // CHECK: %[[RESULT:.*]] = "mhlo.dynamic_slice"(%arg0, %[[RESHAPED_START1]], %[[RESHAPED_START2]]) {slice_sizes = dense<[1, 4]> : tensor<2xi64>} : (tensor<3x4xi32>, tensor, tensor) -> tensor<1x4xi32> + // CHECK: %[[RESULT:.*]] = "mhlo.dynamic_slice"(%arg0, %[[RESHAPED_START1]], %[[RESHAPED_START2]]) <{slice_sizes = dense<[1, 4]> : tensor<2xi64>}> : (tensor<3x4xi32>, tensor, tensor) -> tensor<1x4xi32> // CHECK: return %[[RESULT]] : tensor<1x4xi32> %sizes = "tf.Const"() {value = dense<[1, 4]> : tensor<2xi64>} : () -> (tensor<2xi64>) %0 = 
"tf.XlaDynamicSlice"(%arg0, %arg1, %sizes) : (tensor<3x4xi32>, tensor<2xi64>, tensor<2xi64>) -> tensor<1x4xi32> @@ -5996,11 +5996,11 @@ func.func @test_xla_reduce_window(%arg0: tensor<7xf32>, %arg1: tensor) -> t %cst_1 = "tf.Const"() {value = dense<2> : tensor<1xi32>} : () -> tensor<1xi32> %cst_2 = "tf.Const"() {value = dense<3> : tensor<1xi32>} : () -> tensor<1xi32> %cst_3 = "tf.Const"() {value = dense<4> : tensor<1xi32>} : () -> tensor<1xi32> - // CHECK: %[[REDUCE:.*]] = "mhlo.reduce_window"(%arg0, %arg1) ({ + // CHECK: %[[REDUCE:.*]] = "mhlo.reduce_window"(%arg0, %arg1) <{base_dilations = dense<3> : tensor<1xi64>, padding = dense<0> : tensor<1x2xi64>, window_dilations = dense<4> : tensor<1xi64>, window_dimensions = dense<1> : tensor<1xi64>, window_strides = dense<2> : tensor<1xi64>}> ({ // CHECK-NEXT: ^{{.*}}(%[[ARG0:.*]]: tensor, %[[ARG1:.*]]: tensor) // CHECK-NEXT: %[[SUM:.*]] = func.call @sum_reducer3(%[[ARG0]], %[[ARG1]]){{.*}} // CHECK-NEXT: mhlo.return %[[SUM]] : tensor - // CHECK-NEXT: }) {base_dilations = dense<3> : tensor<1xi64>, padding = dense<0> : tensor<1x2xi64>, window_dilations = dense<4> : tensor<1xi64>, window_dimensions = dense<1> : tensor<1xi64>, window_strides = dense<2> : tensor<1xi64>} : (tensor<7xf32>, tensor) -> tensor<10xf32> + // CHECK-NEXT: }) : (tensor<7xf32>, tensor) -> tensor<10xf32> // CHECK-NEXT: return %[[REDUCE]] %0 = "tf.XlaReduceWindow"(%arg0, %arg1, %cst_0, %cst_1, %cst_2, %cst_3, %cst) {computation = @sum_reducer3} : (tensor<7xf32>, tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1x2xi32>) -> tensor<10xf32> func.return %0 : tensor<10xf32> @@ -6020,11 +6020,11 @@ func.func private @sum_reducer3(%arg0: tensor, %arg1: tensor) -> tenso // CHECK-LABEL: @xlasort_int // CHECK-SAME: %[[INPUT:.*]]: tensor<16xi32> func.func @xlasort_int(%input: tensor<16xi32>) -> (tensor<16xi32>) { - // CHECK-NEXT: %[[SORT:.*]] = "mhlo.sort"(%[[INPUT]]) ({ + // CHECK-NEXT: %[[SORT:.*]] = "mhlo.sort"(%[[INPUT]]) <{dimension = 
-1 : i64, is_stable = false}> ({ // CHECK-NEXT: ^{{.*}}(%[[LHS:.*]]: tensor, %[[RHS:.*]]: tensor) // CHECK-NEXT: %[[CMP:.*]] = mhlo.compare LT, %[[LHS]], %[[RHS]], NOTYPE // CHECK-NEXT: mhlo.return %[[CMP]] - // CHECK-NEXT: }) {dimension = -1 : i64, is_stable = false} : (tensor<16xi32>) -> tensor<16xi32> + // CHECK-NEXT: }) : (tensor<16xi32>) -> tensor<16xi32> // CHECK-NEXT: return %[[SORT]] %output = "tf.XlaSort"(%input) : (tensor<16xi32>) -> (tensor<16xi32>) func.return %output : tensor<16xi32> @@ -6035,11 +6035,11 @@ func.func @xlasort_int(%input: tensor<16xi32>) -> (tensor<16xi32>) { // CHECK-LABEL: @xlasort_float // CHECK-SAME: %[[INPUT:.*]]: tensor<8xf64> func.func @xlasort_float(%input: tensor<8xf64>) -> (tensor<8xf64>) { - // CHECK-NEXT: %[[SORT:.*]] = "mhlo.sort"(%[[INPUT]]) ({ + // CHECK-NEXT: %[[SORT:.*]] = "mhlo.sort"(%[[INPUT]]) <{dimension = -1 : i64, is_stable = false}> ({ // CHECK-NEXT: ^{{.*}}(%[[LHS:.*]]: tensor, %[[RHS:.*]]: tensor) // CHECK-NEXT: %[[CMP:.*]] = mhlo.compare LT, %[[LHS]], %[[RHS]], TOTALORDER // CHECK-NEXT: mhlo.return %[[CMP]] - // CHECK-NEXT: }) {dimension = -1 : i64, is_stable = false} : (tensor<8xf64>) -> tensor<8xf64> + // CHECK-NEXT: }) : (tensor<8xf64>) -> tensor<8xf64> // CHECK-NEXT: return %[[SORT]] %output = "tf.XlaSort"(%input) : (tensor<8xf64>) -> (tensor<8xf64>) func.return %output : tensor<8xf64> @@ -6067,7 +6067,7 @@ func.func @xla_rng_bit_generator(%arg0: tensor<2xui64>) -> (tensor<2xui64>, tens %cst = "tf.Const"() {value = dense<[10, 12]> : tensor<2xi32>} : () -> tensor<2xi32> // CHECK-NEXT: %1 = mhlo.constant dense<3> : tensor %cst_0 = "tf.Const"() {value = dense<3> : tensor} : () -> tensor - // CHECK-NEXT: %[[OUTPUT_STATE:.*]], %[[OUTPUT:.*]] = "mhlo.rng_bit_generator"(%[[STATE]]) {rng_algorithm = #mhlo.rng_algorithm} : (tensor<2xui64>) -> (tensor<2xui64>, tensor<10x12xui32>) + // CHECK-NEXT: %[[OUTPUT_STATE:.*]], %[[OUTPUT:.*]] = "mhlo.rng_bit_generator"(%[[STATE]]) <{rng_algorithm = #mhlo.rng_algorithm}> : 
(tensor<2xui64>) -> (tensor<2xui64>, tensor<10x12xui32>) // CHECK-NEXT: return %[[OUTPUT_STATE]], %[[OUTPUT]] : tensor<2xui64>, tensor<10x12xui32> %output_key, %output = "tf.XlaRngBitGenerator"(%cst_0, %arg0, %cst) : (tensor, tensor<2xui64>, tensor<2xi32>) -> (tensor<2xui64>, tensor<10x12xui32>) func.return %output_key, %output : tensor<2xui64>, tensor<10x12xui32> @@ -6123,11 +6123,11 @@ func.func private @sum_reducer2(%arg0: tensor, %arg1: tensor) -> tenso func.func @xla_variadic_sort(%arg0: tensor<2x3x4xui8>) -> tensor<2x3x4xui8> attributes {tf.entry_function = {control_outputs = "", inputs = "_arg0,_arg1", outputs = "_retval0"}} { // CHECK-NEXT: {{.*}} = mhlo.constant dense<0> : tensor %cst = "tf.Const"() {value = dense<0> : tensor} : () -> tensor - // CHECK-NEXT: %[[SORT:.*]] = "mhlo.sort"(%[[INPUT]]) ({ + // CHECK-NEXT: %[[SORT:.*]] = "mhlo.sort"(%[[INPUT]]) <{dimension = 0 : i64, is_stable = false}> ({ // CHECK-NEXT: ^{{.*}}(%[[LHS:.*]]: tensor, %[[RHS:.*]]: tensor) // CHECK-NEXT: %[[CMP:.*]] = func.call @compare_lt(%[[LHS]], %[[RHS]]) : (tensor, tensor) -> tensor // CHECK-NEXT: mhlo.return %[[CMP]] - // CHECK-NEXT: }) {dimension = 0 : i64, is_stable = false} : (tensor<2x3x4xui8>) -> tensor<2x3x4xui8> + // CHECK-NEXT: }) : (tensor<2x3x4xui8>) -> tensor<2x3x4xui8> // CHECK-NEXT: return %[[SORT]] %0 = "tf.XlaVariadicSort"(%arg0, %cst) {_XlaHasReferenceVars = false, comparator = @compare_lt, device = "/job:localhost/replica:0/task:0/device:XLA_GPU:0", is_stable = false} : (tensor<2x3x4xui8>, tensor) -> tensor<2x3x4xui8> func.return %0 : tensor<2x3x4xui8> @@ -6172,7 +6172,7 @@ func.func @test_xla_select_and_scatter(%arg0: tensor<4x5x1x1xbf16>, %arg1: tenso %cst = "tf.Const"() {value = dense<0> : tensor<4x2xi32>} : () -> tensor<4x2xi32> %cst_0 = "tf.Const"() {value = dense<[2, 2, 1, 1]> : tensor<4xi32>} : () -> tensor<4xi32> %cst_1 = "tf.Const"() {value = dense<[2, 3, 1, 1]> : tensor<4xi32>} : () -> tensor<4xi32> - // CHECK: %[[SELECT_AND_SCATTER:.*]] = 
"mhlo.select_and_scatter"(%arg0, %arg1, %arg2) ({ + // CHECK: %[[SELECT_AND_SCATTER:.*]] = "mhlo.select_and_scatter"(%arg0, %arg1, %arg2) <{padding = dense<0> : tensor<4x2xi64>, window_dimensions = dense<[2, 3, 1, 1]> : tensor<4xi64>, window_strides = dense<[2, 2, 1, 1]> : tensor<4xi64>}> ({ // CHECK-NEXT: ^{{.*}}(%[[ARG0:.*]]: tensor, %[[ARG1:.*]]: tensor) // CHECK-NEXT: %[[RES:.*]] = func.call @ge_select(%[[ARG0]], %[[ARG1]]){{.*}} // CHECK-NEXT: mhlo.return %[[RES]] : tensor @@ -6180,7 +6180,7 @@ func.func @test_xla_select_and_scatter(%arg0: tensor<4x5x1x1xbf16>, %arg1: tenso // CHECK-NEXT: ^{{.*}}(%[[ARG2:.*]]: tensor, %[[ARG3:.*]]: tensor) // CHECK-NEXT: %[[RES:.*]] = func.call @add_scatter(%[[ARG2]], %[[ARG3]]){{.*}} // CHECK-NEXT: mhlo.return %[[RES]] : tensor - // CHECK-NEXT: }) {padding = dense<0> : tensor<4x2xi64>, window_dimensions = dense<[2, 3, 1, 1]> : tensor<4xi64>, window_strides = dense<[2, 2, 1, 1]> : tensor<4xi64>} : (tensor<4x5x1x1xbf16>, tensor<2x2x1x1xbf16>, tensor) -> tensor + // CHECK-NEXT: }) : (tensor<4x5x1x1xbf16>, tensor<2x2x1x1xbf16>, tensor) -> tensor // CHECK-NEXT: return %[[SELECT_AND_SCATTER]] %0 = "tf.XlaSelectAndScatter"(%arg0, %cst_1, %cst_0, %cst, %arg1, %arg2) {scatter = @add_scatter, select = @ge_select} : (tensor<4x5x1x1xbf16>, tensor<4xi32>, tensor<4xi32>, tensor<4x2xi32>, tensor<2x2x1x1xbf16>, tensor) -> tensor func.return %0 : tensor diff --git a/tensorflow/compiler/mlir/tf2xla/tests/verify-tfxla-legalization.mlir b/tensorflow/compiler/mlir/tf2xla/tests/verify-tfxla-legalization.mlir index e6623350380fcb..fa0170a2b6b7a9 100644 --- a/tensorflow/compiler/mlir/tf2xla/tests/verify-tfxla-legalization.mlir +++ b/tensorflow/compiler/mlir/tf2xla/tests/verify-tfxla-legalization.mlir @@ -6,7 +6,7 @@ func.func @allowsMHLO() -> (tensor<8x64x32x4xcomplex> {mhlo.sharding = ""}) { %0 = mhlo.constant dense<(1.000000e+00,-1.000000e+00)> : tensor<128x32x4xcomplex> %1 = mhlo.constant dense<(1.000000e+00,1.000000e+00)> : 
tensor<8x64x128xcomplex> - %2 = "mhlo.einsum"(%1, %0) {einsum_config = "abc,cde->abde"} : (tensor<8x64x128xcomplex>, tensor<128x32x4xcomplex>) -> tensor<8x64x32x4xcomplex> + %2 = "mhlo.einsum"(%1, %0) <{einsum_config = "abc,cde->abde"}> : (tensor<8x64x128xcomplex>, tensor<128x32x4xcomplex>) -> tensor<8x64x32x4xcomplex> return %2 : tensor<8x64x32x4xcomplex> } @@ -53,7 +53,7 @@ func.func @nonstatic_shape_mhlo() -> tensor attributes {tf.entry_function %1 = mhlo.convert %0 : (tensor) -> tensor %2 = mhlo.reshape %1 : (tensor) -> tensor<1xi64> // expected-error @+1 {{Node `mhlo.dynamic_iota` must have compile-time constant}} - %3 = "mhlo.dynamic_iota"(%2) {iota_dimension = 0 : i64} : (tensor<1xi64>) -> tensor + %3 = "mhlo.dynamic_iota"(%2) <{iota_dimension = 0 : i64}> : (tensor<1xi64>) -> tensor %4 = mhlo.multiply %3, %3 : tensor return %4 : tensor } diff --git a/tensorflow/compiler/mlir/tfrt/BUILD b/tensorflow/compiler/mlir/tfrt/BUILD index 21cdf1203a3554..31b6aa272faf1d 100644 --- a/tensorflow/compiler/mlir/tfrt/BUILD +++ b/tensorflow/compiler/mlir/tfrt/BUILD @@ -472,9 +472,7 @@ tf_proto_library( cc_library( name = "passes", - visibility = [ - "//visibility:private", # Only private by automation, not intent. Owner may accept CLs adding visibility. See go/scheuklappen#explicit-private. - ], + visibility = ["//visibility:private"], deps = [ "//tensorflow/compiler/mlir/tfrt:tf_to_tfrt", ], @@ -695,6 +693,7 @@ cc_library( hdrs = ["backend_compiler.h"], deps = [ "//tensorflow/core/tfrt/runtime", + "@com_google_absl//absl/status", "@llvm-project//mlir:IR", ], ) diff --git a/tensorflow/compiler/mlir/tfrt/backend_compiler.h b/tensorflow/compiler/mlir/tfrt/backend_compiler.h index 0e959f04f43554..7167c8ef18e0ea 100644 --- a/tensorflow/compiler/mlir/tfrt/backend_compiler.h +++ b/tensorflow/compiler/mlir/tfrt/backend_compiler.h @@ -16,6 +16,7 @@ limitations under the License. 
#ifndef TENSORFLOW_COMPILER_MLIR_TFRT_BACKEND_COMPILER_H_ #define TENSORFLOW_COMPILER_MLIR_TFRT_BACKEND_COMPILER_H_ +#include "absl/status/status.h" #include "mlir/IR/BuiltinOps.h" // from @llvm-project #include "mlir/IR/DialectRegistry.h" // from @llvm-project #include "tensorflow/core/tfrt/runtime/runtime.h" diff --git a/tensorflow/compiler/mlir/tfrt/ir/mlrt/BUILD b/tensorflow/compiler/mlir/tfrt/ir/mlrt/BUILD index ce69fa85189423..bfc93b9252ccbf 100644 --- a/tensorflow/compiler/mlir/tfrt/ir/mlrt/BUILD +++ b/tensorflow/compiler/mlir/tfrt/ir/mlrt/BUILD @@ -88,9 +88,7 @@ td_library( "tf_mlrt_tpu_ops.td", ], includes = ["."], - visibility = [ - "//visibility:private", # Only private by automation, not intent. Owner may accept CLs adding visibility. See go/scheuklappen#explicit-private. - ], + visibility = ["//visibility:private"], deps = [ ":mlrt_td_files", ":tf_mlrt_td_files", diff --git a/tensorflow/compiler/mlir/tfrt/ir/mlrt/tf_mlrt_ops.td b/tensorflow/compiler/mlir/tfrt/ir/mlrt/tf_mlrt_ops.td index 6ff38dda69bd85..fcbf2358b3b936 100644 --- a/tensorflow/compiler/mlir/tfrt/ir/mlrt/tf_mlrt_ops.td +++ b/tensorflow/compiler/mlir/tfrt/ir/mlrt/tf_mlrt_ops.td @@ -462,7 +462,8 @@ def IfrtLoadVariableOp: TensorflowMlrt_Op<"ifrt_load_variable", [Pure]> { `tf.IfrtLoadVariableOp` converts the tensor into an IFRT array based on device and sharding configuration specified in `VariableDeviceShardingConfigProto`. - This op returns a scalar string tensor as a key for user to look for the loaded array. + This op returns a scalar string tensor as a key for user to look for the loaded array + and a future containing the restored tensor. 
}]; let arguments = (ins @@ -472,7 +473,8 @@ def IfrtLoadVariableOp: TensorflowMlrt_Op<"ifrt_load_variable", [Pure]> { ); let results = (outs - TFTensorType:$array_key + TFTensorType:$array_key, + MlrtFutureType: $tensor_future ); } diff --git a/tensorflow/compiler/mlir/tfrt/ir/mlrt/tf_ops.td b/tensorflow/compiler/mlir/tfrt/ir/mlrt/tf_ops.td index bb567f32106215..0791423a91c17f 100644 --- a/tensorflow/compiler/mlir/tfrt/ir/mlrt/tf_ops.td +++ b/tensorflow/compiler/mlir/tfrt/ir/mlrt/tf_ops.td @@ -148,5 +148,39 @@ def TFTPUCompileAndExecuteOp : TensorflowMlrt_Op<"tf_tpu_compile_and_execute", [ }]; } +def TFIfrtLoadVariableOp: TensorflowMlrt_Op<"tf_ifrt_load_variable", [Pure]> { + let summary = "Loads a variable tensor as an IFRT array for mlrt"; + + let description = [{ + This is the MLRT version of tf.IfrtLoadVariableOp. + + This op loads a variable tensor as an IFRT array and binds it with the specified name. + + This op is an replacement of `tf.ReadVariableOp` in the case that a constant + variable tensor is an input to the tpu program invoked by `tf.IfrtCall`. + + After a `tf.ReadVariableOp` is lowered into `tf.IfrtLoadVariableOp`, the `tf.IfrtCall` kernel + will bind the loaded IFRT array by name with the tpu program's input. + + `tf.IfrtLoadVariableOp` converts the tensor into an IFRT array based on device and sharding + configuration specified in `VariableDeviceShardingConfigProto`. + + This op returns a scalar string tensor as a key for user to look for the loaded array + and a future containing the restored tensor. 
+ }]; + + let arguments = (ins + TF_Tensor:$variable, + StrAttr:$device_sharding_config_proto_text, + StrAttr:$name + ); + + let results = (outs + TF_Tensor:$array_key, + MlrtFutureType: $tensor_future + ); +} + + #endif diff --git a/tensorflow/compiler/mlir/tfrt/tests/ifrt/sink_variable_as_named_array.mlir b/tensorflow/compiler/mlir/tfrt/tests/ifrt/sink_variable_as_named_array.mlir index ba644948c6b06d..dec4b733d25b19 100644 --- a/tensorflow/compiler/mlir/tfrt/tests/ifrt/sink_variable_as_named_array.mlir +++ b/tensorflow/compiler/mlir/tfrt/tests/ifrt/sink_variable_as_named_array.mlir @@ -6,7 +6,7 @@ // // CHECK-LABEL: func.func @serving_default(%arg0: tensor<1x3xf32>) -> tensor<1x1xf32> { // CHECK-NEXT: [[HANDLE2:%.*]] = "tf.VarHandleOp" -// CHECK-NEXT: [[KEY:%.*]] = "tf.IfrtLoadVariable"([[HANDLE2]]) +// CHECK-NEXT: [[KEY:%.*]], [[FUTURE:%.*]] = "tf.IfrtLoadVariable"([[HANDLE2]]) // CHECK-SAME: device_sharding_config_proto_text = "sharding { type: OTHER tile_assignment_dimensions: 2 tile_assignment_dimensions: 1 tile_assignment_devices: 0 tile_assignment_devices: 1 } device_ids: 0 device_ids: 1 " // CHECK-SAME: name = "__y" // CHECK-NEXT: [[RES:%.*]] = "tf.IfrtCall"([[KEY]], %arg0) <{program_id = 6515870160938153680 : i64, variable_arg_indices = [0 : i32]}> @@ -27,9 +27,9 @@ module { // // CHECK-LABEL: func.func @serving_default(%arg0: tensor<1x3xf32>) -> tensor<1x1xf32> { // CHECK: "tf.VarHandleOp" -// CHECK-NEXT: [[VARIABLE:%.*]] = "tf.ReadVariableOp" -// CHECK-NEXT: [[KEY:%.*]] = "tf.IfrtLoadVariable" -// CHECK-NEXT: "tf.MatMul"(%arg0, [[VARIABLE]]) +// CHECK-NOT: [[VARIABLE:%.*]] = "tf.ReadVariableOp" +// CHECK-NEXT: [[KEY:%.*]], [[FUTURE:%.*]] = "tf.IfrtLoadVariable" +// CHECK-NEXT: "tf.MatMul"(%arg0, [[FUTURE]]) // CHECK-NEXT: [[RES:%.*]] = "tf.IfrtCall"(%arg0, [[KEY]]) <{program_id = 6515870160938153680 : i64, variable_arg_indices = [1 : i32]}> // CHECK-NEXT: return [[RES]] : tensor<1x1xf32> // @@ -42,3 +42,22 @@ module { return %result : tensor<1x1xf32> } 
} + +// ----- +// Variable tensor is only for host +// +// CHECK-LABEL: func.func @serving_default(%arg0: tensor<1x3xf32>) -> tensor<1x1xf32> { +// CHECK: "tf.VarHandleOp" +// CHECK-NOT: [[VARIABLE:%.*]] = "tf.ReadVariableOp" +// CHECK-NEXT: [[KEY:%.*]], [[FUTURE:%.*]] = "tf.IfrtLoadVariable" +// CHECK-NEXT: [[RES:%.*]] = "tf.MatMul"(%arg0, [[FUTURE]]) +// CHECK-NEXT: return [[RES]] : tensor<1x1xf32> +// +module { + func.func @serving_default(%arg0: tensor<1x3xf32>) -> tensor<1x1xf32> { + %0 = "tf.VarHandleOp"() <{container = "", shared_name = "y"}> : () -> tensor>> + %2 = "tf.ReadVariableOp"(%0) : (tensor>>) -> tensor<3x1xf32> + %3 = "tf.MatMul"(%arg0, %2) : (tensor<1x3xf32>, tensor<3x1xf32>) -> tensor<1x1xf32> + return %3: tensor<1x1xf32> + } +} diff --git a/tensorflow/compiler/mlir/tfrt/tests/mlrt/rewrite_ifrt_load_variable.mlir b/tensorflow/compiler/mlir/tfrt/tests/mlrt/rewrite_ifrt_load_variable.mlir new file mode 100644 index 00000000000000..e1ad0aea205007 --- /dev/null +++ b/tensorflow/compiler/mlir/tfrt/tests/mlrt/rewrite_ifrt_load_variable.mlir @@ -0,0 +1,20 @@ +// RUN: tf-tfrt-opt -split-input-file -tf-mlrt-rewrite-ifrt-load-variable %s | FileCheck %s + +// Variable is used by both CPU and TPU +// +// CHECK-LABEL: func @serving_default(%arg0: tensor<1x3xf32>) -> tensor<1x1xf32> +// CHECK-NEXT: [[HANDLE:%.*]] = "tf.VarHandleOp"() +// CHECK-NEXT: [[ARRAYKEY:%.*]], [[FURTURE:%.*]] = "tf_mlrt.tf_ifrt_load_variable"([[HANDLE]]) +// CHECK-SAME: {device_sharding_config_proto_text = "sharding { }", name = "__y"} : (tensor>>) -> (tensor, !mlrt.future) +// CHECK-NEXT: [[TENSOR:%.*]] = "tf_mlrt.tf_await"([[FURTURE]]) : (!mlrt.future) -> tensor<3x1xf32> +// CHECK-NEXT: "tf.MatMul"(%arg0, [[TENSOR]]) : (tensor<1x3xf32>, tensor<3x1xf32>) -> tensor<1x1xf32> +// CHECK-NEXT: "tf.IfrtCall"(%arg0, [[ARRAYKEY]]) <{program_id = 6515870160938153680 : i64, variable_arg_indices = [1 : i32]}> {__tpu_compile_metadata_text = "retvals { sharding { } }"} : (tensor<1x3xf32>, tensor) 
-> tensor<1x1xf32> +// CHECK-NEXT: return +// + func.func @serving_default(%arg0: tensor<1x3xf32>) -> tensor<1x1xf32> { + %0 = "tf.VarHandleOp"() <{container = "", shared_name = "y"}> : () -> tensor>> + %array_key, %tensor = "tf.IfrtLoadVariable"(%0) <{device_sharding_config_proto_text = "sharding { }", name = "__y"}> : (tensor>>) -> (tensor, tensor<3x1xf32>) + %1 = "tf.MatMul"(%arg0, %tensor) : (tensor<1x3xf32>, tensor<3x1xf32>) -> tensor<1x1xf32> + %2 = "tf.IfrtCall"(%arg0, %array_key) <{program_id = 6515870160938153680 : i64, variable_arg_indices = [1 : i32]}> {__tpu_compile_metadata_text = "retvals { sharding { } }"} : (tensor<1x3xf32>, tensor) -> tensor<1x1xf32> + return %2 : tensor<1x1xf32> + } diff --git a/tensorflow/compiler/mlir/tfrt/tests/mlrt/tf_to_mlrt.mlir b/tensorflow/compiler/mlir/tfrt/tests/mlrt/tf_to_mlrt.mlir index 3cb879dabe97f7..3151daf80ec759 100644 --- a/tensorflow/compiler/mlir/tfrt/tests/mlrt/tf_to_mlrt.mlir +++ b/tensorflow/compiler/mlir/tfrt/tests/mlrt/tf_to_mlrt.mlir @@ -470,7 +470,8 @@ func.func @ifrt_load_variable_test() -> () { // CHECK-NEXT: "tf_mlrt.ifrt_load_variable"([[HANDLE]]) // CHECK-SAME: device_sharding_config_proto_text // CHECK-SAME: name = "__variable" - %1 = "tf.IfrtLoadVariable"(%0) <{device_sharding_config_proto_text = "sharding { } device_ids: 0 device_ids: 1 ", name = "__variable"}> {__op_key = 2: i32, device = "/device:CPU:0"} : (tensor>>) -> (tensor) + %1, %2 = "tf_mlrt.tf_ifrt_load_variable"(%0) {device_sharding_config_proto_text = "sharding { } device_ids: 0 device_ids: 1 ", name = "__variable", __op_key = 2: i32, device = "/device:CPU:0"} : (tensor>>) -> (tensor, !mlrt.future) + // CHECK-NEXT: mlrt.await_all_control // CHECK-NEXT: return func.return } diff --git a/tensorflow/compiler/mlir/tfrt/transforms/ifrt/BUILD b/tensorflow/compiler/mlir/tfrt/transforms/ifrt/BUILD index 6ef5c011d0a11d..305195e744932f 100644 --- a/tensorflow/compiler/mlir/tfrt/transforms/ifrt/BUILD +++ 
b/tensorflow/compiler/mlir/tfrt/transforms/ifrt/BUILD @@ -124,9 +124,15 @@ cc_library( ":ifrt_constants", ":ifrt_types", "//tensorflow/compiler/jit:xla_cpu_jit", + "//tensorflow/compiler/mlir/tensorflow", "//tensorflow/compiler/mlir/tensorflow:dump_mlir_util", + "//tensorflow/compiler/mlir/tensorflow:error_util", "//tensorflow/compiler/mlir/tensorflow:serialize_mlir_module_utils", + "//tensorflow/compiler/mlir/tensorflow:tensorflow_ops_a_m_inc_gen", + "//tensorflow/compiler/mlir/tensorflow:tensorflow_ops_n_z_inc_gen", + "//tensorflow/compiler/mlir/tensorflow:visitor", "//tensorflow/compiler/mlir/tf2xla/api/v2:legalize_tf", + "//tensorflow/compiler/tf2xla:common", "//tensorflow/compiler/tf2xla:layout_util", "//tensorflow/compiler/tf2xla:xla_compiler", "//tensorflow/core:core_cpu_base", @@ -160,6 +166,29 @@ cc_library( ], ) +cc_library( + name = "extract_callback", + srcs = ["extract_callback.cc"], + hdrs = ["extract_callback.h"], + deps = [ + "//tensorflow/compiler/mlir/tensorflow:error_util", + "//tensorflow/compiler/mlir/tensorflow:tensorflow_ops_a_m_inc_gen", + "//tensorflow/compiler/mlir/tensorflow:tensorflow_ops_n_z_inc_gen", + "//tensorflow/compiler/mlir/tensorflow:visitor", + "//tensorflow/core:protos_all_cc", + "@com_google_absl//absl/log", + "@com_google_absl//absl/log:check", + "@com_google_absl//absl/status", + "@com_google_absl//absl/status:statusor", + "@com_google_absl//absl/strings", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:Support", + ], +) + tf_cc_test( name = "tf2hlo_test", srcs = [ diff --git a/tensorflow/compiler/mlir/tfrt/transforms/ifrt/extract_callback.cc b/tensorflow/compiler/mlir/tfrt/transforms/ifrt/extract_callback.cc new file mode 100644 index 00000000000000..67f10482d16d15 --- /dev/null +++ b/tensorflow/compiler/mlir/tfrt/transforms/ifrt/extract_callback.cc @@ -0,0 +1,81 @@ +/* Copyright 2024 The TensorFlow Authors. 
All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/mlir/tfrt/transforms/ifrt/extract_callback.h" + +#include + +#include "absl/log/check.h" +#include "absl/log/log.h" +#include "absl/status/status.h" +#include "absl/status/statusor.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/string_view.h" +#include "llvm/ADT/StringRef.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/OperationSupport.h" // from @llvm-project +#include "mlir/IR/OwningOpRef.h" // from @llvm-project +#include "mlir/IR/Visitors.h" // from @llvm-project +#include "mlir/Pass/PassManager.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops_a_m.h.inc" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops_n_z.h.inc" +#include "tensorflow/compiler/mlir/tensorflow/utils/error_util.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/visitor.h" +#include "tensorflow/core/framework/tensor_shape.pb.h" +#include "tensorflow/core/framework/types.pb.h" + +namespace tensorflow { +namespace ifrt_serving { + +absl::StatusOr> ExtractCallbackModule( + mlir::ModuleOp module, absl::string_view callback_key) { + // Find the entry function name first. 
+ mlir::func::FuncOp callback_entry_func; + module.walk([&](mlir::func::FuncOp func) { + if (func.getSymName().str() == callback_key) { + callback_entry_func = func; + return mlir::WalkResult::skip(); + } + return mlir::WalkResult::advance(); + }); + + if (!callback_entry_func) { + return absl::NotFoundError( + absl::StrCat("Callback key ", callback_key, " not found")); + } + + mlir::StatusScopedDiagnosticHandler diag_handler(module->getContext()); + auto entry_function_name = callback_entry_func.getSymName(); + auto submodule = mlir::TF::CreatePrunedModule(module, entry_function_name); + if (mlir::failed(submodule)) { + return diag_handler.ConsumeStatus(); + } + + // Remove the attribute inherited from saved model loading. They impose + // additional constraint on public functions that are not necessary. + submodule->get()->removeAttr("tf_saved_model.semantics"); + submodule->get().walk([&](mlir::func::FuncOp func) { + if (func.getSymName() == entry_function_name) { + func.setPublic(); + } + }); + return std::move(*submodule); +} + +} // namespace ifrt_serving +} // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tfrt/transforms/ifrt/extract_callback.h b/tensorflow/compiler/mlir/tfrt/transforms/ifrt/extract_callback.h new file mode 100644 index 00000000000000..a345d1d881a79e --- /dev/null +++ b/tensorflow/compiler/mlir/tfrt/transforms/ifrt/extract_callback.h @@ -0,0 +1,36 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_IFRT_EXTRACT_CALLBACK_H_ +#define TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_IFRT_EXTRACT_CALLBACK_H_ + +#include "absl/status/statusor.h" +#include "absl/strings/string_view.h" +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/OwningOpRef.h" // from @llvm-project +#include "tensorflow/core/framework/types.pb.h" + +namespace tensorflow { +namespace ifrt_serving { + +// Extracts a module that consists of a public callback function in name of +// `callback_key` and all its reachables. +absl::StatusOr> ExtractCallbackModule( + mlir::ModuleOp module, absl::string_view callback_key); + +} // namespace ifrt_serving +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_IFRT_EXTRACT_CALLBACK_H_ diff --git a/tensorflow/compiler/mlir/tfrt/transforms/ifrt/ifrt_backend_compiler.cc b/tensorflow/compiler/mlir/tfrt/transforms/ifrt/ifrt_backend_compiler.cc index 8646be3554a0d9..ebaf2570bba3f4 100644 --- a/tensorflow/compiler/mlir/tfrt/transforms/ifrt/ifrt_backend_compiler.cc +++ b/tensorflow/compiler/mlir/tfrt/transforms/ifrt/ifrt_backend_compiler.cc @@ -92,6 +92,8 @@ CompileAndRegisterIfrtPrograms(absl::string_view model_name, model_name, entry_function_name.str(), *std::move(submodule), ifrt_model_context.GetClient(), &ifrt_model_context.GetThreadPool(), &ifrt_model_context.GetLoadedVariableRegistry(), + &ifrt_model_context.GetRestoreTensorRegistry(), + ifrt_model_context.GetDeviceMgr(), ifrt_model_context.GetShapeRepresentationFn()); // Register the Ifrt program to `ServingExecutableRegistry` so that @@ -145,14 +147,14 @@ absl::Status IfrtBackendCompiler::CompileTensorflow( tensorflow::DumpMlirOpToFile("ifrt_tpu_bct_conversion_before", module); } - // Run backward compat pass so that we can use 
bridge to do clustering. - auto backward_compat_result = - tensorflow::RunTPUBackwardCompatConversion(module, {}); - if (mlir::failed(backward_compat_result)) { - return diag_handler.Combine( - absl::InternalError("Failed to handle legacy TPU Ops")); + if (tpu_compiler_ != nullptr) { + // Run backward compat pass so that we can use bridge to do clustering. + if (mlir::failed( + tpu_compiler_->RunTPUBackwardCompatConversion(module, {}))) { + return diag_handler.Combine( + absl::InternalError("Failed to handle legacy TPU Ops")); + } } - if (VLOG_IS_ON(1)) { tensorflow::DumpMlirOpToFile("ifrt_tpu_bct_conversion_after", module); } diff --git a/tensorflow/compiler/mlir/tfrt/transforms/ifrt/ifrt_backend_compiler.h b/tensorflow/compiler/mlir/tfrt/transforms/ifrt/ifrt_backend_compiler.h index c227027a48ba17..2407fe7cc3546c 100644 --- a/tensorflow/compiler/mlir/tfrt/transforms/ifrt/ifrt_backend_compiler.h +++ b/tensorflow/compiler/mlir/tfrt/transforms/ifrt/ifrt_backend_compiler.h @@ -16,10 +16,10 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_IFRT_IFRT_BACKEND_COMPILER_H_ #define TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_IFRT_IFRT_BACKEND_COMPILER_H_ - #include "absl/status/status.h" #include "mlir/IR/BuiltinOps.h" // from @llvm-project #include "tensorflow/compiler/mlir/tfrt/backend_compiler.h" +#include "tensorflow/compiler/mlir/tfrt/transforms/tpu_passes.h" #include "tensorflow/core/tfrt/runtime/runtime.h" namespace tensorflow { @@ -28,11 +28,17 @@ namespace ifrt_serving { // Implements the custom backend compiler for IFRT based serving in TFRT. class IfrtBackendCompiler : public tensorflow::BackendCompiler { public: + explicit IfrtBackendCompiler(TpuCompiler* tpu_compiler = nullptr) + : tpu_compiler_(tpu_compiler) {} + // Rewrites the tensorflow graph in MLIR for IFRT serving. The methods // extracts regions for IFRT execution on accelerator (e.g. TPU). 
absl::Status CompileTensorflow( tensorflow::tfrt_stub::ModelRuntimeContext& model_context, mlir::ModuleOp module) const override; + + private: + TpuCompiler* tpu_compiler_; // Not owned. }; } // namespace ifrt_serving diff --git a/tensorflow/compiler/mlir/tfrt/transforms/ifrt/sink_variable_as_named_array.cc b/tensorflow/compiler/mlir/tfrt/transforms/ifrt/sink_variable_as_named_array.cc index 90bdbf1f1ce6e8..b3bf510003e797 100644 --- a/tensorflow/compiler/mlir/tfrt/transforms/ifrt/sink_variable_as_named_array.cc +++ b/tensorflow/compiler/mlir/tfrt/transforms/ifrt/sink_variable_as_named_array.cc @@ -84,31 +84,60 @@ class SinkVariableAsNamedArrayPass } } + // TODO(b/332906178): collapse the below with the + // CollectVariablesUsedByDevice above or just remove the + // CollectVariablesUsedByDevice. + // // Rewrite ReadVariableOp with IfrtLoadVariableOp llvm::SmallDenseMap read_to_load; - for (auto& [name, variable_config] : variable_config_by_name) { - for (auto& read_variable_op : variable_config.read_variable_op) { - builder.setInsertionPointAfter(read_variable_op); - // TODO(b/319045348): consider use resource alias analysis for this. 
- auto var_handle = GetDefiningOp( - read_variable_op.getResource()); - - if (!var_handle) { - read_variable_op->emitError( - "ReadVariableOp has no defining VarHandleOp."); - return signalPassFailure(); - } - auto load_variable_op = builder.create( - read_variable_op->getLoc(), - mlir::RankedTensorType::get( - {}, builder.getType()), - var_handle.getResult(), - builder.getStringAttr(variable_config.device_sharding_config), - builder.getStringAttr(name)); - read_to_load[read_variable_op] = load_variable_op; - } + mlir::WalkResult walk_result = + module.walk([&](mlir::TF::ReadVariableOp read_variable_op) { + mlir::FailureOr variable_runtime_name = + GetVariableTensorName(read_variable_op); + if (mlir::failed(variable_runtime_name)) { + read_variable_op->emitError() << "Failed to get variable name."; + return mlir::WalkResult::interrupt(); + } + + builder.setInsertionPointAfter(read_variable_op); + // TODO(b/319045348): consider use resource alias analysis for + // this. + auto var_handle = GetDefiningOp( + read_variable_op.getResource()); + + if (!var_handle) { + read_variable_op->emitError( + "ReadVariableOp has no defining VarHandleOp."); + return mlir::WalkResult::interrupt(); + } + + auto iter = variable_config_by_name.find(*variable_runtime_name); + mlir::StringAttr device_sharding_config_attr; + if (iter == variable_config_by_name.end()) { + device_sharding_config_attr = builder.getStringAttr(""); + } else { + device_sharding_config_attr = + builder.getStringAttr(iter->second.device_sharding_config); + } + + std::vector result_types; + result_types.push_back(mlir::RankedTensorType::get( + {}, builder.getType())); + result_types.push_back(read_variable_op.getResult().getType()); + + auto load_variable_op = builder.create( + read_variable_op->getLoc(), result_types, var_handle.getResult(), + device_sharding_config_attr, + builder.getStringAttr(*variable_runtime_name)); + read_to_load[read_variable_op] = load_variable_op; + + return mlir::WalkResult::advance(); + 
}); + + if (walk_result.wasInterrupted()) { + return signalPassFailure(); } // Rewrite ifrt call: variable tensors are sunk as attribute. @@ -142,7 +171,7 @@ class SinkVariableAsNamedArrayPass variable_arg_indices.push_back(arg_idx); // Variable use the key from IfrtLoadVariable. updated_args.push_back( - read_to_load[arg.read_variable_op].getResult()); + read_to_load[arg.read_variable_op].getArrayKey()); } else { // non variable updated_args.push_back(call->getOperand(arg_idx)); @@ -162,14 +191,16 @@ class SinkVariableAsNamedArrayPass call.erase(); } - // Delete all ReadVariableOps that are not used. - for (auto& [name, variable_config] : variable_config_by_name) { - for (auto& read_variable_op : variable_config.read_variable_op) { - if (read_variable_op.use_empty()) { - read_variable_op.erase(); - } + // Remove all ReadVariableOp after replacing the CPU usage of + // ReadVariableOp. + module.walk([&](mlir::TF::ReadVariableOp read_variable_op) { + if (!read_variable_op->use_empty()) { + // Replace CPU use of ReadVariableOp + read_variable_op.replaceAllUsesWith( + read_to_load[read_variable_op].getTensorFuture()); } - } + read_variable_op.erase(); + }); } private: diff --git a/tensorflow/compiler/mlir/tfrt/transforms/ifrt/tf_ifrt_passes.cc b/tensorflow/compiler/mlir/tfrt/transforms/ifrt/tf_ifrt_passes.cc index 9737c681d28aa8..1a76571e25bc03 100644 --- a/tensorflow/compiler/mlir/tfrt/transforms/ifrt/tf_ifrt_passes.cc +++ b/tensorflow/compiler/mlir/tfrt/transforms/ifrt/tf_ifrt_passes.cc @@ -73,6 +73,7 @@ void AddClusterToIfrtRuntimeOpsPassPipeline(OpPassManager& pm, pm.addNestedPass(CreateTfIdentityPropagationPass()); pm.addNestedPass(CreateTfRestoreSplittingPass()); + pm.addNestedPass(mlir::createCanonicalizerPass()); pm.addNestedPass(CreateTfRestorePruningPass()); pm.addNestedPass(CreateTfRestoreMergingPass()); diff --git a/tensorflow/compiler/mlir/tfrt/transforms/mlrt/BUILD b/tensorflow/compiler/mlir/tfrt/transforms/mlrt/BUILD index 7d28571db5030a..d7fafb49ee6cdd 
100644 --- a/tensorflow/compiler/mlir/tfrt/transforms/mlrt/BUILD +++ b/tensorflow/compiler/mlir/tfrt/transforms/mlrt/BUILD @@ -99,6 +99,7 @@ cc_library( ":async_while", ":fuse_mlrt_ops", ":parallelization", + ":rewrite_ifrt_load_variable", ":tf_to_mlrt", ":while_to_map_fn", "//tensorflow/compiler/mlir/tfrt:tfrt_pipeline_options", @@ -228,3 +229,24 @@ cc_library( "@llvm-project//mlir:TransformUtils", ], ) + +cc_library( + name = "rewrite_ifrt_load_variable", + srcs = ["rewrite_ifrt_load_variable.cc"], + hdrs = ["rewrite_ifrt_load_variable.h"], + deps = [ + "//tensorflow/compiler/mlir/tensorflow", + "//tensorflow/compiler/mlir/tensorflow/ir/host_runtime:tensorflow_tfrt_ops", + "//tensorflow/compiler/mlir/tfrt/ir/mlrt:mlrt_ops", + "//tensorflow/compiler/mlir/tfrt/ir/mlrt:tf_mlrt_ops", + "@com_google_absl//absl/container:flat_hash_set", + "@com_google_absl//absl/log:check", + "@com_google_absl//absl/strings", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:Support", + "@llvm-project//mlir:TransformUtils", + ], +) diff --git a/tensorflow/compiler/mlir/tfrt/transforms/mlrt/import_model.cc b/tensorflow/compiler/mlir/tfrt/transforms/mlrt/import_model.cc index b288ccde63c2f8..af932ff5011895 100644 --- a/tensorflow/compiler/mlir/tfrt/transforms/mlrt/import_model.cc +++ b/tensorflow/compiler/mlir/tfrt/transforms/mlrt/import_model.cc @@ -51,7 +51,7 @@ limitations under the License. 
namespace tensorflow { namespace mlrt_compiler { -StatusOr ConvertTfMlirToBytecode( +absl::StatusOr ConvertTfMlirToBytecode( const TfrtCompileOptions& options, tfrt_stub::FallbackState& fallback_state, mlir::ModuleOp module, tfrt_stub::ModelRuntimeContext& model_context, mlir::OwningOpRef* module_with_op_keys, @@ -132,13 +132,13 @@ StatusOr ConvertTfMlirToBytecode( auto statusor = mlrt::EmitExecutable(registry, module); if (!statusor.ok()) return statusor.status(); bytecode_buffer = std::move(*statusor); - return OkStatus(); + return absl::OkStatus(); }, model_context, &fallback_state, added_xla_function_names)); return bytecode_buffer; } -StatusOr ConvertTfMlirWithOpKeysToBytecode( +absl::StatusOr ConvertTfMlirWithOpKeysToBytecode( const TfrtCompileOptions& options, const tfrt_stub::FallbackState& fallback_state, mlir::ModuleOp module_with_op_keys, diff --git a/tensorflow/compiler/mlir/tfrt/transforms/mlrt/passes.cc b/tensorflow/compiler/mlir/tfrt/transforms/mlrt/passes.cc index ac9606d4ee7f6c..eaa53c838e3796 100644 --- a/tensorflow/compiler/mlir/tfrt/transforms/mlrt/passes.cc +++ b/tensorflow/compiler/mlir/tfrt/transforms/mlrt/passes.cc @@ -23,6 +23,7 @@ limitations under the License. 
#include "tensorflow/compiler/mlir/tfrt/transforms/mlrt/async_while.h" #include "tensorflow/compiler/mlir/tfrt/transforms/mlrt/fuse_mlrt_ops.h" #include "tensorflow/compiler/mlir/tfrt/transforms/mlrt/parallelization.h" +#include "tensorflow/compiler/mlir/tfrt/transforms/mlrt/rewrite_ifrt_load_variable.h" #include "tensorflow/compiler/mlir/tfrt/transforms/mlrt/tf_to_mlrt.h" #include "tensorflow/compiler/mlir/tfrt/transforms/mlrt/while_to_map_fn.h" #include "tensorflow/compiler/mlir/tfrt/transforms/tfrt_pipeline_options.h" @@ -37,6 +38,7 @@ void RegisterMlrtPasses() { mlir::registerPass([]() { return CreateAsyncWhilePass(); }); mlir::registerPass([]() { return CreateParallelizationPass(); }); mlir::registerPass([]() { return CreateWhileToMapFnPass(); }); + mlir::registerPass([]() { return CreateRewriteIfrtLoadVariablePass(); }); mlir::registerPass( []() { return CreateTfToMlrtPreParallelizationConversionPass({}); }); mlir::registerPass([]() { return CreateTfToMlrtConversionPass({}); }); @@ -50,6 +52,8 @@ void CreateTfToMlrtPipeline(mlir::OpPassManager &pm, pm.addPass( mlrt_compiler::CreateTfToMlrtPreParallelizationConversionPass(options)); + pm.addPass(mlrt_compiler::CreateRewriteIfrtLoadVariablePass()); + if (options.enable_while_parallel_iterations) { pm.addPass(mlrt_compiler::CreateAsyncWhilePass()); } diff --git a/tensorflow/compiler/mlir/tfrt/transforms/mlrt/rewrite_ifrt_load_variable.cc b/tensorflow/compiler/mlir/tfrt/transforms/mlrt/rewrite_ifrt_load_variable.cc new file mode 100644 index 00000000000000..368a91ac54f955 --- /dev/null +++ b/tensorflow/compiler/mlir/tfrt/transforms/mlrt/rewrite_ifrt_load_variable.cc @@ -0,0 +1,105 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/mlir/tfrt/transforms/mlrt/rewrite_ifrt_load_variable.h" + +#include +#include + +#include "llvm/ADT/APInt.h" +#include "llvm/ADT/StringRef.h" +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/DialectRegistry.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Support/TypeID.h" // from @llvm-project +#include "mlir/Transforms/DialectConversion.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/host_runtime/tfrt_ops.h" +#include "tensorflow/compiler/mlir/tfrt/ir/mlrt/mlrt_dialect.h" +#include "tensorflow/compiler/mlir/tfrt/ir/mlrt/tf_mlrt_ops.h" + +namespace tensorflow { +namespace mlrt_compiler { +namespace { + +class RewriteIfrtLoadVariablePass + : public mlir::PassWrapper> { + public: + RewriteIfrtLoadVariablePass() = default; + RewriteIfrtLoadVariablePass &operator=(const RewriteIfrtLoadVariablePass &) = + delete; + + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(RewriteIfrtLoadVariablePass) + + private: + void getDependentDialects(mlir::DialectRegistry ®istry) const override { + registry.insert(); + registry.insert(); + } + + llvm::StringRef getArgument() const final { + return "tf-mlrt-rewrite-ifrt-load-variable"; + } + + llvm::StringRef getDescription() 
const final { + return "Convert tf.IfrtLoadVariable to tf_mlrt.TFIfrtLoadVariable"; + } + + void runOnOperation() override { + mlir::ModuleOp module = getOperation(); + mlir::OpBuilder builder(module); + + module->walk([&](mlir::TF::IfrtLoadVariableOp load_variable_op) { + builder.setInsertionPoint(load_variable_op); + + std::vector result_types; + result_types.push_back(load_variable_op.getArrayKey().getType()); + result_types.push_back(builder.getType()); + auto mlrt_load_variable_op = + builder.create( + load_variable_op->getLoc(), result_types, + load_variable_op->getOperands(), load_variable_op->getAttrs()); + for (auto user : load_variable_op.getTensorFuture().getUsers()) { + builder.setInsertionPoint(user); + auto await_op = builder.create( + user->getLoc(), load_variable_op.getTensorFuture().getType(), + mlrt_load_variable_op.getTensorFuture()); + user->replaceUsesOfWith(load_variable_op.getTensorFuture(), + await_op.getResult()); + } + + for (auto user : load_variable_op.getArrayKey().getUsers()) { + user->replaceUsesOfWith(load_variable_op.getArrayKey(), + mlrt_load_variable_op.getArrayKey()); + } + + load_variable_op->erase(); + }); + } +}; + +} // namespace + +std::unique_ptr> +CreateRewriteIfrtLoadVariablePass() { + return std::make_unique(); +} + +} // namespace mlrt_compiler +} // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tfrt/transforms/mlrt/rewrite_ifrt_load_variable.h b/tensorflow/compiler/mlir/tfrt/transforms/mlrt/rewrite_ifrt_load_variable.h new file mode 100644 index 00000000000000..1423011b05b01c --- /dev/null +++ b/tensorflow/compiler/mlir/tfrt/transforms/mlrt/rewrite_ifrt_load_variable.h @@ -0,0 +1,36 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_MLRT_REWRITE_IFRT_LOAD_VARIABLE_H_ +#define TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_MLRT_REWRITE_IFRT_LOAD_VARIABLE_H_ + +#include + +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project + +namespace tensorflow { +namespace mlrt_compiler { + +// Creates a pass that converts tf.IfrtLoadVariableOp to +// tf_mlrt.TFIfrtLoadVariableOp and inserts tf_mlrt.Await on the returned future +// from tf_mlrt.TFIfrtLoadVariableOp if it is used by CPU ops. +std::unique_ptr> +CreateRewriteIfrtLoadVariablePass(); + +} // namespace mlrt_compiler +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_MLRT_REWRITE_IFRT_LOAD_VARIABLE_H_ diff --git a/tensorflow/compiler/mlir/tfrt/transforms/mlrt/tf_to_mlrt.cc b/tensorflow/compiler/mlir/tfrt/transforms/mlrt/tf_to_mlrt.cc index 37ddf0b1bf076d..350c424636b2f8 100644 --- a/tensorflow/compiler/mlir/tfrt/transforms/mlrt/tf_to_mlrt.cc +++ b/tensorflow/compiler/mlir/tfrt/transforms/mlrt/tf_to_mlrt.cc @@ -16,7 +16,6 @@ limitations under the License. #include -#include #include #include #include @@ -24,16 +23,20 @@ limitations under the License. 
#include #include "google/protobuf/text_format.h" +#include "llvm/ADT/StringRef.h" #include "llvm/Support/Casting.h" #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project #include "mlir/Dialect/Func/Transforms/FuncConversions.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project #include "mlir/IR/BuiltinAttributes.h" // from @llvm-project -#include "mlir/IR/BuiltinDialect.h" // from @llvm-project #include "mlir/IR/BuiltinOps.h" // from @llvm-project #include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/DialectRegistry.h" // from @llvm-project #include "mlir/IR/Location.h" // from @llvm-project #include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project #include "mlir/IR/SymbolTable.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "mlir/Transforms/DialectConversion.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/host_runtime/tfrt_ops.h.inc" @@ -326,17 +329,24 @@ class GetResourceOpConversion final } }; -// Convert tf.IfrtLoadVariableOp to tf_mlrt.IfrtLoadVariableOp -class IfrtLoadVariableOpConversion - : public mlir::OpConversionPattern { +// Convert tf_mlrt.TFIfrtLoadVariableOp to tf_mlrt.IfrtLoadVariableOp +class TFIfrtLoadVariableOpConversion + : public mlir::OpConversionPattern { public: - using OpConversionPattern::OpConversionPattern; + TFIfrtLoadVariableOpConversion(mlir::MLIRContext *context, + mlir::TypeConverter *type_converter) + : mlir::OpConversionPattern(context), + type_converter_(*type_converter) {} mlir::LogicalResult matchAndRewrite( - mlir::TF::IfrtLoadVariableOp op, OpAdaptor adaptor, + tf_mlrt::TFIfrtLoadVariableOp op, OpAdaptor adaptor, mlir::ConversionPatternRewriter &rewriter) const override { - llvm::SmallVector result_types( - op->getNumResults(), rewriter.getType()); + llvm::SmallVector result_types; + for (auto 
type : op->getResultTypes()) { + if (failed(type_converter_.convertType(type, result_types))) + return mlir::failure(); + } + auto new_op = rewriter.create( op.getLoc(), result_types, adaptor.getOperands()[0], op.getDeviceShardingConfigProtoTextAttr(), op.getNameAttr()); @@ -344,6 +354,9 @@ class IfrtLoadVariableOpConversion return mlir::success(); } + + private: + mlir::TypeConverter &type_converter_; }; // Convert tf.IfrtRestoreVariableOp to tf_mlrt.IfrtRestoreVariableOp @@ -523,7 +536,7 @@ class ExecuteOpConversion final : public mlir::ConversionPattern { node_def.device(), op->getNumOperands(), [&](tensorflow::AttrValueMap *attr_value_map) { *attr_value_map = node_def.attr(); - return OkStatus(); + return absl::OkStatus(); }, fallback_state_.device_manager(), fallback_state_.process_function_library_runtime()); @@ -1187,6 +1200,7 @@ class TfToMlrtConversionPass target.addIllegalDialect(); target.addIllegalOp(); + target.addIllegalOp(); target.addIllegalOp(); target.addIllegalOp(); target.addIllegalOp(); @@ -1223,16 +1237,16 @@ class TfToMlrtConversionPass // Order the list of added ops alphabetically. patterns.add(&context, &type_converter_, &symbol_table); patterns.add(&context); + SetResourceOpConversion, IfrtRestoreVariableOpConversion, + TFAwaitOpConversion, TFPromiseOpConversion>(&context); patterns.add(type_converter_, &context); patterns.add(&context, &symbol_table, &type_converter_, &execute_op_registry_, &op_kernel_cache_, &fallback_state_); - patterns.add, + patterns.add, TFCallOpConversion, TFCallOpConversion>(&context, &type_converter_); diff --git a/tensorflow/compiler/mlir/tfrt/transforms/mlrt/util.cc b/tensorflow/compiler/mlir/tfrt/transforms/mlrt/util.cc index a1f9d401f5c485..5ab6678da892a4 100644 --- a/tensorflow/compiler/mlir/tfrt/transforms/mlrt/util.cc +++ b/tensorflow/compiler/mlir/tfrt/transforms/mlrt/util.cc @@ -34,7 +34,7 @@ bool UseFallback(mlir::Operation *op) { // TF kernels so that we don't need to check every op here. 
return !llvm::isa< mlir::TF::_TfrtSetResourceOp, mlir::TF::_TfrtGetResourceOp, - mlir::TF::BatchFunctionOp, mlir::TF::CaseOp, mlir::TF::IfrtLoadVariableOp, + mlir::TF::BatchFunctionOp, mlir::TF::CaseOp, mlir::TF::IfrtRestoreVariableOp, mlir::TF::StatefulPartitionedCallOp, mlir::TF::PartitionedCallOp, mlir::TF::LegacyCallOp, mlir::TF::IfOp, mlir::TF::WhileOp, mlir::TF::TPUCompileMlirAndExecuteOp>(op); diff --git a/tensorflow/compiler/mlir/tfrt/translate/import_model.cc b/tensorflow/compiler/mlir/tfrt/translate/import_model.cc index f61a087e782704..3cf8be9c90cb62 100644 --- a/tensorflow/compiler/mlir/tfrt/translate/import_model.cc +++ b/tensorflow/compiler/mlir/tfrt/translate/import_model.cc @@ -80,7 +80,7 @@ namespace { // Exports all XLA functions in the form of XlaLaunch, and their nested // functions. -StatusOr> ExportXlaFunctions( +absl::StatusOr> ExportXlaFunctions( mlir::ModuleOp module, std::vector* added_xla_function_names) { // Find all XLA functions. std::vector xla_functions; @@ -306,7 +306,7 @@ Status ConvertTfMlirToBef(const TfrtCompileOptions& options, absl::InternalError("failed to convert MLIR to BEF.")); bef_buffer->shrink_to_fit(); - return OkStatus(); + return absl::OkStatus(); }, model_context, fallback_state, added_xla_function_names); } @@ -364,7 +364,7 @@ tensorflow::Status AddXlaFunctions( } } - return tensorflow::OkStatus(); + return absl::OkStatus(); } } // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tfrt/utils/export.cc b/tensorflow/compiler/mlir/tfrt/utils/export.cc index a26008731405f7..182fdb3008193a 100644 --- a/tensorflow/compiler/mlir/tfrt/utils/export.cc +++ b/tensorflow/compiler/mlir/tfrt/utils/export.cc @@ -37,7 +37,8 @@ namespace tensorflow { absl::Status ExportFunctionDefs( mlir::ModuleOp module, - absl::AnyInvocable callback) { + absl::AnyInvocable callback, + bool export_tf_original_func_name) { tsl::profiler::TraceMe traceme([&]() { return tsl::profiler::TraceMeEncode( "ExportFunctionDefs", @@ -58,7 +59,7 @@ 
absl::Status ExportFunctionDefs( } } tensorflow::GraphExportConfig configs; - configs.export_original_tf_func_name = true; + configs.export_original_tf_func_name = export_tf_original_func_name; for (auto func : module.getOps()) { tensorflow::FunctionDef function_def; diff --git a/tensorflow/compiler/mlir/tfrt/utils/export.h b/tensorflow/compiler/mlir/tfrt/utils/export.h index 7a226974bffcf6..84f0e272d4f828 100644 --- a/tensorflow/compiler/mlir/tfrt/utils/export.h +++ b/tensorflow/compiler/mlir/tfrt/utils/export.h @@ -15,7 +15,6 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_MLIR_TFRT_UTILS_EXPORT_H_ #define TENSORFLOW_COMPILER_MLIR_TFRT_UTILS_EXPORT_H_ -#include #include "absl/functional/any_invocable.h" #include "absl/status/status.h" @@ -29,7 +28,8 @@ namespace tensorflow { // be suitable for FunctionDef export. absl::Status ExportFunctionDefs( mlir::ModuleOp module, - absl::AnyInvocable callback); + absl::AnyInvocable callback, + bool export_tf_original_func_name = true); } // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/BUILD b/tensorflow/compiler/mlir/tools/kernel_gen/BUILD index 86e2e269e4d329..1069f3fd172411 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/BUILD +++ b/tensorflow/compiler/mlir/tools/kernel_gen/BUILD @@ -129,6 +129,7 @@ tf_cc_binary( "@llvm-project//llvm:TargetParser", "@llvm-project//mlir:BufferizationInterfaces", "@llvm-project//mlir:ExecutionEngineUtils", + "@llvm-project//mlir:IR", "@llvm-project//mlir:LLVMToLLVMIRTranslation", "@llvm-project//mlir:MemRefTransforms", "@llvm-project//mlir:Pass", diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/hlo_to_kernel.cc b/tensorflow/compiler/mlir/tools/kernel_gen/hlo_to_kernel.cc index d5c29e90ef7ed1..7c53bc23fda464 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/hlo_to_kernel.cc +++ b/tensorflow/compiler/mlir/tools/kernel_gen/hlo_to_kernel.cc @@ -24,19 +24,26 @@ #include "absl/status/status.h" #include "absl/strings/string_view.h" 
+#include "llvm/ADT/SmallString.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/CodeGen/CommandFlags.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/LegacyPassManager.h" #include "llvm/IR/Module.h" #include "llvm/MC/TargetRegistry.h" +#include "llvm/Support/CodeGen.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/SourceMgr.h" #include "llvm/Support/TargetSelect.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetMachine.h" #include "llvm/TargetParser/Host.h" #include "mlir/Dialect/MemRef/Transforms/AllocationOpInterfaceImpl.h" // from @llvm-project #include "mlir/ExecutionEngine/OptUtils.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/Diagnostics.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/OwningOpRef.h" // from @llvm-project #include "mlir/Pass/PassManager.h" // from @llvm-project #include "mlir/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.h" // from @llvm-project #include "mlir/Target/LLVMIR/Export.h" // from @llvm-project @@ -45,7 +52,8 @@ #include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/status.h" -#include "tensorflow/core/platform/statusor.h" +#include "tsl/platform/errors.h" +#include "tsl/platform/statusor.h" namespace tensorflow { namespace kernel_gen { @@ -149,7 +157,7 @@ Status Run(llvm::StringRef input_file, llvm::StringRef output_file, // Write .a file. 
TF_RETURN_IF_ERROR( WriteStringToFile(Env::Default(), output_file.str(), binary)); - return OkStatus(); + return absl::OkStatus(); } } // namespace diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/kernel_creator.cc b/tensorflow/compiler/mlir/tools/kernel_gen/kernel_creator.cc index eb6c06ac54f9c4..277511fed098e0 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/kernel_creator.cc +++ b/tensorflow/compiler/mlir/tools/kernel_gen/kernel_creator.cc @@ -144,7 +144,7 @@ Status LowerHloToJITInvocation(mlir::ModuleOp module, if (failed(pm.run(module))) { return absl::InternalError("Lowering HLO to JIT invocation failed."); } - return OkStatus(); + return absl::OkStatus(); } Status LowerHlotoLoops(mlir::ModuleOp module, @@ -236,7 +236,7 @@ Status LowerHlotoLoops(mlir::ModuleOp module, if (failed(pm.run(module))) { return absl::InternalError("Lowering HLO to loops failed."); } - return OkStatus(); + return absl::OkStatus(); } Status LowerLoopsToGPU(mlir::ModuleOp module, bool index_64bit, @@ -305,7 +305,7 @@ Status LowerLoopsToGPU(mlir::ModuleOp module, bool index_64bit, if (failed(pm.run(module))) { return absl::InternalError("Lowering to GPU kernels failed."); } - return OkStatus(); + return absl::OkStatus(); } Status LowerKernelBodiesToLowLevelIr(mlir::ModuleOp module, @@ -350,7 +350,7 @@ Status LowerKernelBodiesToLowLevelIr(mlir::ModuleOp module, "Lowering to low-level device IR failed."); } - return OkStatus(); + return absl::OkStatus(); } Status AmendKernelLLVMIRWithStaticKnowledge(mlir::ModuleOp module, @@ -366,7 +366,7 @@ Status AmendKernelLLVMIRWithStaticKnowledge(mlir::ModuleOp module, return failed(pm.run(module)) ? tensorflow::errors::Internal( "Amending LLVMIR with static knowledge failed.") - : OkStatus(); + : absl::OkStatus(); } Status GenerateDeviceCode(mlir::ModuleOp module, @@ -387,7 +387,7 @@ Status GenerateDeviceCode(mlir::ModuleOp module, return failed(pm.run(module)) ? 
tensorflow::errors::Internal("Generating device code failed.") - : OkStatus(); + : absl::OkStatus(); } Status LowerHostSideToFinalForm(mlir::ModuleOp module, bool apply_cl_options) { @@ -402,7 +402,7 @@ Status LowerHostSideToFinalForm(mlir::ModuleOp module, bool apply_cl_options) { return failed(pm.run(module)) ? tensorflow::errors::Internal( "Final lowering of host side failed.") - : OkStatus(); + : absl::OkStatus(); } } // namespace diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/BUILD b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/BUILD index c4abb6420d9b38..d1c3af0b9a6191 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/BUILD +++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/BUILD @@ -166,7 +166,7 @@ cc_library( "@local_xla//xla/service/gpu/llvm_gpu_backend", ] + if_cuda_is_configured([ "@local_tsl//tsl/platform:cuda_libdevice_path", - "@local_xla//xla/stream_executor/gpu:asm_compiler", + "@local_xla//xla/stream_executor/cuda:cuda_asm_compiler", ]) + if_rocm_is_configured([ "@local_xla//xla/stream_executor/gpu:asm_compiler", "//tensorflow/core/platform:rocm_rocdl_path", diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/gpu_kernel_to_blob_pass.cc b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/gpu_kernel_to_blob_pass.cc index 345eccfd12c5f0..58f3c195e900d1 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/gpu_kernel_to_blob_pass.cc +++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/gpu_kernel_to_blob_pass.cc @@ -39,7 +39,7 @@ limitations under the License. 
#include "tsl/platform/cuda_libdevice_path.h" #if GOOGLE_CUDA -#include "xla/stream_executor/gpu/asm_compiler.h" +#include "xla/stream_executor/cuda/cuda_asm_compiler.h" #elif TENSORFLOW_USE_ROCM #include "xla/stream_executor/gpu/asm_compiler.h" #include "tensorflow/core/platform/rocm_rocdl_path.h" diff --git a/tensorflow/compiler/tests/randomized_tests.cc b/tensorflow/compiler/tests/randomized_tests.cc index 8847aa1a811455..6fb2158b00e3a5 100644 --- a/tensorflow/compiler/tests/randomized_tests.cc +++ b/tensorflow/compiler/tests/randomized_tests.cc @@ -281,7 +281,7 @@ Status OpTestBuilder::BuildGraph(const string& name_prefix, *test_node_def = test_def; } - return OkStatus(); + return absl::OkStatus(); } // Test fixture. The fixture manages the random number generator and its seed, @@ -1386,7 +1386,7 @@ Status TensorsAreCloseImpl(const Tensor& x, const Tensor& y, double atol, " rtol = ", rtol, " tol = ", atol + rtol * Abs(Tx(i)))); } } - return OkStatus(); + return absl::OkStatus(); } template @@ -1400,7 +1400,7 @@ Status TensorsAreEqualImpl(const Tensor& x, const Tensor& y) { Str(Ty(i)), ". 
x = ", x.DebugString(), "y = ", y.DebugString())); } } - return OkStatus(); + return absl::OkStatus(); } Status TensorsAreEqualImplBfloat16(const Tensor& x, const Tensor& y) { @@ -1414,7 +1414,7 @@ Status TensorsAreEqualImplBfloat16(const Tensor& x, const Tensor& y) { "y = ", y.DebugString())); } } - return OkStatus(); + return absl::OkStatus(); } // Tests if "x" and "y" are tensors of the same type, same shape, and with diff --git a/tensorflow/compiler/tf2xla/light_outside_compilation_kernels_for_test.cc b/tensorflow/compiler/tf2xla/light_outside_compilation_kernels_for_test.cc index 07ed120620e8be..5cc05693996934 100644 --- a/tensorflow/compiler/tf2xla/light_outside_compilation_kernels_for_test.cc +++ b/tensorflow/compiler/tf2xla/light_outside_compilation_kernels_for_test.cc @@ -32,7 +32,7 @@ REGISTER_OP("TestStaticTf") .Output("output: float") .SetShapeFn([](shape_inference::InferenceContext* c) { c->set_output(0, c->input(0)); - return OkStatus(); + return absl::OkStatus(); }); class TestStaticTfOp : public OpKernel { @@ -69,7 +69,7 @@ REGISTER_OP("TestStaticMultipleOutputTf") .SetShapeFn([](shape_inference::InferenceContext* c) { c->set_output(0, c->input(0)); c->set_output(1, c->input(0)); - return OkStatus(); + return absl::OkStatus(); }); class TestStaticMultipleOutputTfOp : public OpKernel { @@ -117,7 +117,7 @@ REGISTER_OP("TestDynamicTf") .Output("output: float") .SetShapeFn([](shape_inference::InferenceContext* c) { c->set_output(0, c->UnknownShapeOfRank(c->Rank(c->input(0)))); - return OkStatus(); + return absl::OkStatus(); }); // Same as TestStaticTfOp, but only copies up to `max_size` attribute. @@ -183,7 +183,7 @@ REGISTER_OP("DynamicMultidim") .Output("output: float") .SetShapeFn([](shape_inference::InferenceContext* c) { c->set_output(0, c->UnknownShapeOfRank(5)); - return OkStatus(); + return absl::OkStatus(); }); // Just fill in the data with ones for a given shape. 
@@ -245,7 +245,7 @@ REGISTER_OP("DynamicUnranked") .Output("output: float") .SetShapeFn([](shape_inference::InferenceContext* c) { c->set_output(0, c->UnknownShape()); - return OkStatus(); + return absl::OkStatus(); }); REGISTER_XLA_OP(Name("DynamicUnranked").Device(DEVICE_GPU_XLA_JIT), @@ -258,7 +258,7 @@ REGISTER_OP("TestTfMustBeConstant") .Output("output: float") .SetShapeFn([](shape_inference::InferenceContext* c) { c->set_output(0, c->input(0)); - return OkStatus(); + return absl::OkStatus(); }); class TestTfMustBeConstantOp : public OpKernel { @@ -318,7 +318,7 @@ REGISTER_OP("TestDynamicTfWithBound") .Output("output: float") .SetShapeFn([](shape_inference::InferenceContext* c) { c->set_output(0, c->input(0)); - return OkStatus(); + return absl::OkStatus(); }); class TestDynamicTfWithBoundOp : public OpKernel { diff --git a/tensorflow/compiler/tf2xla/literal_util.cc b/tensorflow/compiler/tf2xla/literal_util.cc index b7dc020187883e..e00ad8eb92132e 100644 --- a/tensorflow/compiler/tf2xla/literal_util.cc +++ b/tensorflow/compiler/tf2xla/literal_util.cc @@ -40,7 +40,7 @@ Status HostTensorToBorrowingLiteral(const xla::Shape& xla_shape, << "Provided xla::Shape must have the same dims as the Tensor shape."; *literal = xla::BorrowingLiteral( static_cast(DMAHelper::base(&host_tensor)), xla_shape); - return OkStatus(); + return absl::OkStatus(); } absl::StatusOr HostTensorToLiteral(const Tensor& host_tensor) { @@ -63,7 +63,7 @@ Status HostTensorToMutableBorrowingLiteral( *literal = xla::MutableBorrowingLiteral( static_cast(DMAHelper::base(host_tensor)), xla_shape); - return OkStatus(); + return absl::OkStatus(); } Status HostTensorsToBorrowingLiteralTuple(absl::Span host_tensors, @@ -83,7 +83,7 @@ Status HostTensorsToBorrowingLiteralTuple(absl::Span host_tensors, *literal = xla::BorrowingLiteral( buf_ptrs, xla::ShapeUtil::MakeTupleShape(tensor_shapes)); - return OkStatus(); + return absl::OkStatus(); } Status CopyLiteralToHostTensor(const xla::LiteralSlice& literal, @@ 
-106,7 +106,7 @@ Status CopyLiteralToHostTensor(const xla::LiteralSlice& literal, void* dst_ptr = DMAHelper::base(host_tensor); memcpy(dst_ptr, src_ptr, total_bytes); } - return OkStatus(); + return absl::OkStatus(); } Status LiteralToHostTensor(const xla::LiteralSlice& literal, diff --git a/tensorflow/compiler/tf2xla/mlir_bridge_pass.cc b/tensorflow/compiler/tf2xla/mlir_bridge_pass.cc index c24654c894b34f..d1a2a68d045bfc 100644 --- a/tensorflow/compiler/tf2xla/mlir_bridge_pass.cc +++ b/tensorflow/compiler/tf2xla/mlir_bridge_pass.cc @@ -36,7 +36,6 @@ limitations under the License. #include "tensorflow/compiler/mlir/tf2xla/api/v2/cluster_tf.h" #include "tensorflow/compiler/mlir/tf2xla/api/v2/tf_dialect_to_executor.h" #include "tensorflow/compiler/mlir/tf2xla/internal/mlir_bridge_pass_util.h" -// #include "tensorflow/compiler/tf2xla/tf2xla_defs.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/core/common_runtime/device_set.h" #include "tensorflow/core/framework/device.h" diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index 656c02e1214ac6..29e0de5edafbc2 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -646,7 +646,6 @@ cc_library( "//tensorflow/core/kernels/mkl:mkl_matmul_op", "//tensorflow/core/kernels/mkl:mkl_sparse_matrix_matmul_op", "//tensorflow/core/kernels/mkl:mkl_tmp_ops", - "//tensorflow/core/kernels/mkl:mkl_deprecated_ops", ]) + if_cuda_or_rocm([ "//tensorflow/core/kernels:cudnn_rnn_kernels", ]) + if_cuda([ @@ -662,9 +661,7 @@ cc_library( cc_library( name = "dynamic_kernels_impl", - visibility = [ - "//visibility:private", # Only private by automation, not intent. Owner may accept CLs adding visibility. See go/scheuklappen#explicit-private. 
- ], + visibility = ["//visibility:private"], deps = [ "//tensorflow/core/kernels:sobol_op", ], @@ -1993,7 +1990,6 @@ tf_cc_test_mkl( "//tensorflow/core/kernels/mkl:mkl_softmax_op", "//tensorflow/core/kernels/mkl:mkl_transpose_op", "//tensorflow/core/kernels/mkl:mkl_tmp_ops", - "//tensorflow/core/kernels/mkl:mkl_deprecated_ops", ]), ) diff --git a/tensorflow/core/api_def/base_api/api_def_AssignVariableXlaConcatND.pbtxt b/tensorflow/core/api_def/base_api/api_def_AssignVariableXlaConcatND.pbtxt index 646f602af22688..6bd6bcd8d05ad5 100644 --- a/tensorflow/core/api_def/base_api/api_def_AssignVariableXlaConcatND.pbtxt +++ b/tensorflow/core/api_def/base_api/api_def_AssignVariableXlaConcatND.pbtxt @@ -5,17 +5,13 @@ op { name: "resource" description: <